/*
 * Copyright (c) 2021, Amazon.com, Inc. or its affiliates. All rights reserved.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP
#define SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP

// Terminology used within this source file:
//
// Card Entry:   This is the information that identifies whether a
//               particular card-table entry is Clean or Dirty.  A clean
//               card entry denotes that the associated memory does not
//               hold references to young-gen memory.
//
// Card Region, aka
// Card Memory:  This is the region of memory that is associated with a
//               particular card entry.
//
// Card Cluster: A card cluster represents 64 card entries.  A card
//               cluster is the minimal amount of work performed at a
//               time by a parallel thread.  Note that the work required
//               to scan a card cluster is somewhat variable in that the
//               required effort depends on how many cards are dirty, how
//               many references are held within the objects that span a
//               DIRTY card's memory, and on the size of the object
//               that spans the end of a DIRTY card's memory (because
//               that object, if it is not an array and is imprecisely
//               dirtied, may need to be scanned in its entirety; imprecise
//               dirtying means that the card corresponding to the object
//               header is dirtied, rather than the card on which the
//               updated field lives).  To better balance work amongst
//               them, parallel worker threads dynamically claim clusters
//               and are flexible in the number of clusters they process.
//
// A cluster represents a "natural" quantum of work to be performed by
// a parallel GC thread's background remembered set scanning efforts.
// The notion of cluster is similar to the notion of stripe in the
// implementation of parallel GC card scanning.  However, a cluster is
// typically smaller than a stripe, enabling finer-grain division of
// labor between multiple threads, and potentially better load balancing
// when dirty cards are not uniformly distributed in the heap, as is often
// the case with generational workloads where more recently promoted objects
// may be dirtied more frequently than older objects.
//
// For illustration, consider the following possible JVM configurations:
//
//   Scenario 1:
//     RegionSize is 128 MB
//     Span of a card entry is 512 B
//     Each card table entry consumes 1 B
//     Assume one long word (8 B) of the card table represents a cluster.
//       This long word holds 8 card table entries, spanning a
//       total of 8 * 512 B = 4 KB of the heap
//     The number of clusters per region is 128 MB / 4 KB = 32 K
//
//   Scenario 2:
//     RegionSize is 128 MB
//     Span of each card entry is 128 B
//     Each card table entry consumes 1 bit
//     Assume one int word (4 B) of the card table represents a cluster.
//       This int word holds 32 b / 1 b = 32 card table entries, spanning a
//       total of 32 * 128 B = 4 KB of the heap
//     The number of clusters per region is 128 MB / 4 KB = 32 K
//
//   Scenario 3:
//     RegionSize is 128 MB
//     Span of each card entry is 512 B
//     Each card table entry consumes 1 bit
//     Assume one long word (8 B) of the card table represents a cluster.
//       This long word holds 64 b / 1 b = 64 card table entries, spanning a
//       total of 64 * 512 B = 32 KB of the heap
//     The number of clusters per region is 128 MB / 32 KB = 4 K
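//
// The cluster counts above follow directly from the card and cluster
// geometry.  As a sketch of the arithmetic (illustrative only; the values
// shown assume 512 B cards and 64 cards per cluster, which matches
// Scenario 3 and the CardsPerCluster constant defined below):
//
//   const size_t card_span_bytes     = 512;                                      // CardTable::card_size()
//   const size_t cards_per_cluster   = 64;                                       // ShenandoahCardCluster<...>::CardsPerCluster
//   const size_t cluster_span_bytes  = cards_per_cluster * card_span_bytes;      // 32 KB
//   const size_t region_size_bytes   = 128 * 1024 * 1024;                        // ShenandoahHeapRegion::region_size_bytes()
//   const size_t clusters_per_region = region_size_bytes / cluster_span_bytes;   // 4 K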
//
// At the start of a new young-gen concurrent mark pass, the gang of
// Shenandoah worker threads collaborate in performing the following
// actions:
//
//  Let old_regions = number of ShenandoahHeapRegions comprising
//    old-gen memory
//  Let region_size = ShenandoahHeapRegion::region_size_bytes()
//    represent the number of bytes in each region
//  Let clusters_per_region = region_size / 512
//  Let rs represent the relevant RememberedSet implementation
//    (an instance of ShenandoahDirectCardMarkRememberedSet or an instance
//    of a to-be-implemented ShenandoahBufferWithSATBRememberedSet)
//
//  for each ShenandoahHeapRegion old_region in the whole heap
//    determine the cluster number of the first cluster belonging
//      to that region
//    for each cluster contained within that region
//      Assure that exactly one worker thread processes each
//      cluster, each thread making a series of invocations of the
//      following:
//
//        rs->process_clusters(worker_id, ReferenceProcessor *,
//                             ShenandoahConcurrentMark *, cluster_no, cluster_count,
//                             HeapWord *end_of_range, OopClosure *oops);
//
//  For efficiency, divide up the clusters so that different threads
//  are responsible for processing different clusters.  Processing costs
//  may vary greatly between clusters for the following reasons:
//
//    a) some clusters contain mostly dirty cards and other
//       clusters contain mostly clean cards
//    b) some clusters contain mostly primitive data and other
//       clusters contain mostly reference data
//    c) some clusters are spanned by very large non-array objects that
//       begin in some other cluster.  When a large non-array object
//       beginning in a preceding cluster spans large portions of
//       this cluster, then because of imprecise dirtying, the
//       portion of the object in this cluster may be clean, but
//       will need to be processed by the worker responsible for
//       this cluster, potentially increasing its work.
//    d) in the case that the end of this cluster is spanned by a
//       very large non-array object, the worker for this cluster will
//       be responsible for processing the portion of the object
//       in this cluster.
//
// Though an initial division of labor between marking threads may
// assign equal numbers of clusters to be scanned by each thread, it
// should be expected that some threads will finish their assigned
// work before others.  Therefore, some amount of the full remembered
// set scanning effort should be held back and assigned incrementally
// to the threads that end up with excess capacity.  Consider the
// following strategy for dividing labor:
//
//    1. Assume there are 8 marking threads and 1024 remembered
//       set clusters to be scanned.
//    2. Assign each thread to scan 64 clusters.  This leaves
//       512 (1024 - (8*64)) clusters to still be scanned.
//    3. As the 8 worker threads complete previous cluster
//       scanning assignments, issue the next 8 scanning assignments
//       as units of 32 additional clusters each.
//       In the case that there is high variance in effort
//       associated with previous cluster scanning assignments,
//       multiples of these next assignments may be serviced by
//       the worker threads that were previously assigned lighter
//       workloads.
//    4. Make subsequent scanning assignments as follows:
//         a) 8 assignments of size 16 clusters
//         b) 8 assignments of size 8 clusters
//         c) 16 assignments of size 4 clusters
//
//    When there is no more remembered set processing work to be
//    assigned to a newly idled worker thread, that thread can move
//    on to work on other tasks associated with root scanning until such
//    time as all clusters have been examined.
//
// Remembered set scanning is designed to run concurrently with
// mutator threads, with multiple concurrent workers.  Furthermore, the
// current implementation of remembered set scanning never clears a
// card once it has been marked.
//
// These limitations will be addressed in future enhancements to the
// existing implementation.

#include <stdint.h>
#include "gc/shared/workerThread.hpp"
#include "gc/shenandoah/shenandoahCardStats.hpp"
#include "gc/shenandoah/shenandoahCardTable.hpp"
#include "gc/shenandoah/shenandoahHeap.hpp"
#include "gc/shenandoah/shenandoahHeapRegion.hpp"
#include "gc/shenandoah/shenandoahNumberSeq.hpp"
#include "gc/shenandoah/shenandoahTaskqueue.hpp"
#include "memory/iterator.hpp"

class ShenandoahReferenceProcessor;
class ShenandoahConcurrentMark;
class ShenandoahHeap;
class ShenandoahRegionIterator;
class ShenandoahMarkingContext;

class CardTable;
typedef CardTable::CardValue CardValue;

class ShenandoahDirectCardMarkRememberedSet: public CHeapObj<mtGC> {

private:

  // Use symbolic constants defined in cardTable.hpp
  //   CardTable::card_shift = 9;
  //   CardTable::card_size = 512;
  //   CardTable::card_size_in_words = 64;
  //   CardTable::clean_card_val()
  //   CardTable::dirty_card_val()

  ShenandoahHeap *_heap;
  ShenandoahCardTable *_card_table;
  size_t _card_shift;
  size_t _total_card_count;
  size_t _cluster_count;
  HeapWord *_whole_heap_base;   // Points to first HeapWord of data contained within heap memory
  CardValue* _byte_map;         // Points to first entry within the card table
  CardValue* _byte_map_base;    // Points to byte_map minus the bias computed from address of heap memory

public:

  // total_card_count is the number of cards represented by the card table.
  ShenandoahDirectCardMarkRememberedSet(ShenandoahCardTable *card_table, size_t total_card_count);
  ~ShenandoahDirectCardMarkRememberedSet();
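
  // As a sketch of the presumed mapping between heap addresses and card
  // indices (illustrative only, assuming the CardTable defaults noted above;
  // the actual definitions are expected to live in the corresponding
  // .inline.hpp file):
  //
  //   size_t card_index_for_addr(HeapWord *p) const {
  //     return pointer_delta(p, _whole_heap_base, 1) >> _card_shift;      // byte offset / card size
  //   }
  //
  //   HeapWord *addr_for_card_index(size_t card_index) const {
  //     return _whole_heap_base + card_index * CardTable::card_size_in_words();
  //   }
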
  // Card index is zero-based relative to _byte_map.
  size_t last_valid_index() const;
  size_t total_cards() const;
  size_t card_index_for_addr(HeapWord *p) const;
  HeapWord *addr_for_card_index(size_t card_index) const;
  inline const CardValue* get_card_table_byte_map(bool write_table) const;
  inline bool is_card_dirty(size_t card_index) const;
  inline bool is_write_card_dirty(size_t card_index) const;
  inline void mark_card_as_dirty(size_t card_index);
  inline void mark_range_as_dirty(size_t card_index, size_t num_cards);
  inline void mark_card_as_clean(size_t card_index);
  inline void mark_read_card_as_clean(size_t card_index);
  inline void mark_range_as_clean(size_t card_index, size_t num_cards);
  inline bool is_card_dirty(HeapWord *p) const;
  inline void mark_card_as_dirty(HeapWord *p);
  inline void mark_range_as_dirty(HeapWord *p, size_t num_heap_words);
  inline void mark_card_as_clean(HeapWord *p);
  inline void mark_range_as_clean(HeapWord *p, size_t num_heap_words);
  inline size_t cluster_count() const;

  // Called by GC thread at start of concurrent mark to exchange roles of read and write remembered sets.
  // Not currently used because the mutator write barrier does not honor changes to the location of the card table.
  void swap_remset() { _card_table->swap_card_tables(); }

  void merge_write_table(HeapWord* start, size_t word_count) {
    size_t card_index = card_index_for_addr(start);
    size_t num_cards = word_count / CardTable::card_size_in_words();
    size_t iterations = num_cards / (sizeof (intptr_t) / sizeof (CardValue));
    intptr_t* read_table_ptr = (intptr_t*) &(_card_table->read_byte_map())[card_index];
    intptr_t* write_table_ptr = (intptr_t*) &(_card_table->write_byte_map())[card_index];
    for (size_t i = 0; i < iterations; i++) {
      // AND-ing the tables preserves dirty entries from either table,
      // because a dirty card value is zero.
      intptr_t card_value = *write_table_ptr;
      *read_table_ptr++ &= card_value;
      write_table_ptr++;
    }
  }

  // Instead of swap_remset, the current implementation of concurrent remembered set scanning does reset_remset
  // in parallel threads, each invocation processing one entire HeapRegion at a time.  Processing of a region
  // consists of copying the write table to the read table and cleaning the write table.
  void reset_remset(HeapWord* start, size_t word_count) {
    size_t card_index = card_index_for_addr(start);
    size_t num_cards = word_count / CardTable::card_size_in_words();
    size_t iterations = num_cards / (sizeof (intptr_t) / sizeof (CardValue));
    intptr_t* read_table_ptr = (intptr_t*) &(_card_table->read_byte_map())[card_index];
    intptr_t* write_table_ptr = (intptr_t*) &(_card_table->write_byte_map())[card_index];
    for (size_t i = 0; i < iterations; i++) {
      *read_table_ptr++ = *write_table_ptr;
      *write_table_ptr++ = CardTable::clean_card_row_val();
    }
  }

  // Called by GC thread after scanning old remembered set in order to prepare for next GC pass
  void clear_old_remset() { _card_table->clear_read_table(); }

};
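
// The reset_remset() protocol described above is expected to be driven one
// region at a time by parallel GC workers.  A minimal sketch of such a loop
// (illustrative only; the remembered set pointer and the mechanism for
// claiming old-gen regions are assumptions, not part of this interface):
//
//   ShenandoahDirectCardMarkRememberedSet* rs = ...;
//   // for each old-gen ShenandoahHeapRegion* r claimed by this worker:
//   rs->reset_remset(r->bottom(), ShenandoahHeapRegion::region_size_words());
//
// Each such call snapshots the region's write table into the read table and
// cleans the write table, so that subsequent scanning of the read table sees
// a stable view while mutators continue to dirty the write table.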

// A ShenandoahCardCluster represents the minimal unit of work
// performed by independent parallel GC threads during scanning of
// remembered sets.
//
// The GC threads that perform card-table remembered set scanning may
// overwrite card-table entries to mark them as clean in the case that
// the associated memory no longer holds references to young-gen
// memory.  Rather than access the card-table entries directly, all GC
// thread access to card-table information is made by way of the
// ShenandoahCardCluster data abstraction.  This abstraction
// effectively manages access to multiple possible underlying
// remembered set implementations, including a traditional card-table
// approach and a SATB-based approach.
//
// The API services represent a compromise between efficiency and
// convenience.
//
// Multiple GC threads scan the remembered set in parallel.  The
// desire is to divide the complete scanning effort into multiple
// clusters of work that can be independently processed by individual
// threads without the need to synchronize efforts between the work
// performed by each task.  The term "cluster" of work is similar to
// the term "stripe" as used in the implementation of Parallel GC.
//
// Complexity arises when an object to be scanned crosses the boundary
// between adjacent cluster regions.  Here is the protocol that we currently
// follow:
//
//  1. The thread responsible for scanning the cards in a cluster modifies
//     the associated card-table entries.  Only cards that are dirty are
//     processed, except as described below for the case of objects that
//     straddle more than one card.
//  2. Object arrays are precisely dirtied, so only the portion of the obj-array
//     that overlaps the range of dirty cards in its cluster is scanned
//     by each worker thread.  This holds for portions of obj-arrays that extend
//     over clusters processed by different workers, with each worker responsible
//     for scanning the portion of the obj-array overlapping the dirty cards in
//     its cluster.
//  3. Non-array objects are precisely dirtied by the interpreter and the compilers.
//     For such objects that extend over multiple cards, or even multiple clusters,
//     the entire object is scanned by the worker that processes the (dirty) card on
//     which the object's header lies.  (However, GC workers should precisely dirty the
//     cards with inter-regional/inter-generational pointers in the body of this object,
//     thus making subsequent scans potentially less expensive.)  Such larger non-array
//     objects are relatively rare.
//
//  A possible criticism:
//    The representation of pointer-location descriptive information
//    within Klass representations is not designed for efficient
//    "random access".  An alternative approach to this design would
//    be to scan very large objects multiple times, once for each
//    cluster that is spanned by the object's range.  This reduces
//    unnecessary overscan, but it introduces different sorts of
//    overhead effort:
//      i) For each spanned cluster, we have to look up the start of
//         the crossing object.
//     ii) Each time we scan the very large object, we have to
//         sequentially walk through its pointer-location
//         descriptors, skipping over all of the pointers that
//         precede the start of the range of addresses that we
//         consider relevant.


// Because old-gen heap memory is not necessarily contiguous, and
// because cards are not necessarily maintained for young-gen memory,
// consecutive card numbers do not necessarily correspond to consecutive
// address ranges.
// For the traditional direct-card-marking
// implementation of this interface, consecutive card numbers are
// likely to correspond to contiguous regions of memory, but this
// should not be assumed.  Instead, rely only upon the following:
//
//  1. All card numbers for cards pertaining to the same
//     ShenandoahHeapRegion are consecutively numbered.
//  2. In the case that neighboring ShenandoahHeapRegions both
//     represent old-gen memory, the card regions that span the
//     boundary between these neighboring heap regions will be
//     consecutively numbered.
//  3. (A corollary) In the case that an old-gen object straddles the
//     boundary between two heap regions, the card regions that
//     correspond to the span of this object will be consecutively
//     numbered.
//
// ShenandoahCardCluster abstracts access to the remembered set
// and also keeps track of crossing map information to allow efficient
// resolution of object start addresses.
//
// ShenandoahCardCluster supports all of the services of
// RememberedSet, plus it supports register_object() and lookup_object().
// Note that we only need to register the start addresses of the object that
// overlays the first address of a card; we need to do this for every card.
// In other words, register_object() checks if the object crosses a card boundary,
// and updates the offset value for each card that the object crosses into.
// For objects that don't straddle cards, nothing needs to be done.
//
// The RememberedSet template parameter is intended to represent either
// ShenandoahDirectCardMarkRememberedSet, or a to-be-implemented
// ShenandoahBufferWithSATBRememberedSet.
template<typename RememberedSet>
class ShenandoahCardCluster: public CHeapObj<mtGC> {

private:
  RememberedSet *_rs;

public:
  static const size_t CardsPerCluster = 64;

private:
  typedef struct cross_map { uint8_t first; uint8_t last; } xmap;
  typedef union crossing_info { uint16_t short_word; xmap offsets; } crossing_info;

  // The ObjectStartsInCardRegion bit is set within crossing_info.offsets.first iff at least one object starts within
  // a particular card region.  We pack this bit into the first byte under the assumption that the first byte is
  // accessed less frequently than the last byte.  This is true when the number of clean cards is greater than the
  // number of dirty cards.
  static const uint16_t ObjectStartsInCardRegion = 0x80;
  static const uint16_t FirstStartBits = 0x3f;

  crossing_info *object_starts;

public:
  // If we're setting first_start, assume the card has an object.
  inline void set_first_start(size_t card_index, uint8_t value) {
    object_starts[card_index].offsets.first = ObjectStartsInCardRegion | value;
  }

  inline void set_last_start(size_t card_index, uint8_t value) {
    object_starts[card_index].offsets.last = value;
  }

  inline void set_starts_object_bit(size_t card_index) {
    object_starts[card_index].offsets.first |= ObjectStartsInCardRegion;
  }

  inline void clear_starts_object_bit(size_t card_index) {
    object_starts[card_index].offsets.first &= ~ObjectStartsInCardRegion;
  }
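
  // For illustration (a sketch of the encoding defined above, not a statement
  // about the registration code): suppose one object begins at word offset 5
  // within a card's memory and another object begins at word offset 60 of the
  // same card (a card spans CardTable::card_size_in_words() == 64 words by
  // default).  After both objects are registered, the card's entry would hold
  //
  //   object_starts[card_index].offsets.first == (ObjectStartsInCardRegion | 5)   // 0x85
  //   object_starts[card_index].offsets.last  == 60
  //
  // and starts_object(card_index) would report true.
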
  // Returns true iff an object is known to start within the card memory associated with card card_index.
  inline bool starts_object(size_t card_index) const {
    return (object_starts[card_index].offsets.first & ObjectStartsInCardRegion) != 0;
  }

  inline void clear_objects_in_range(HeapWord *addr, size_t num_words) {
    size_t card_index = _rs->card_index_for_addr(addr);
    size_t last_card_index = _rs->card_index_for_addr(addr + num_words - 1);
    while (card_index <= last_card_index) {
      object_starts[card_index++].short_word = 0;
    }
  }

  ShenandoahCardCluster(RememberedSet *rs) {
    _rs = rs;
    // TODO: We don't really need object_starts entries for every card entry.  We only need these for
    // the card entries that correspond to old-gen memory.  But for now, let's be quick and dirty.
    object_starts = NEW_C_HEAP_ARRAY(crossing_info, rs->total_cards(), mtGC);
    for (size_t i = 0; i < rs->total_cards(); i++) {
      object_starts[i].short_word = 0;
    }
  }

  ~ShenandoahCardCluster() {
    FREE_C_HEAP_ARRAY(crossing_info, object_starts);
    object_starts = nullptr;
  }

  // There is one entry within the object_starts array for each card entry.
  //
  // Suppose multiple garbage objects are coalesced during GC sweep
  // into a single larger "free segment".  As each two objects are
  // coalesced together, the start information pertaining to the second
  // object must be removed from the object_starts array.  If the
  // second object had been the first object within card memory,
  // the new first object is the object that follows that object if
  // that starts within the same card memory, or NoObject if the
  // following object starts within the following card region.  If the
  // second object had been the last object in the card memory,
  // replace this entry with the newly coalesced object if it starts
  // within the same card memory, or with NoObject if it starts in a
  // preceding card's memory.
  //
  // Suppose a large free segment is divided into a smaller free
  // segment and a new object.  The second part of the newly divided
  // memory must be registered as a new object, overwriting at most
  // one first_start and one last_start entry.  Note that one of the
  // newly divided two objects might be a new GCLAB.
  //
  // Suppose postprocessing of a GCLAB finds that the original GCLAB
  // has been divided into N objects.  Each of the N newly allocated
  // objects will be registered, overwriting at most one first_start
  // and one last_start entry.
  //
  // No object registration operation is linear in the length of
  // the registered objects.
  //
  // Consider further the following observations regarding object
  // registration costs:
  //
  //  1. The cost is paid once for each old-gen object (except when
  //     an object is demoted and repromoted, in which case we would
  //     pay the cost again).
  //  2. The cost can be deferred so that there is no urgency during
  //     mutator copy-on-first-access promotion.  Background GC
  //     threads will update the object_starts array by post-
  //     processing the contents of retired PLAB buffers.
  //  3. The bet is that these costs are paid relatively rarely
  //     because:
  //     a) Most objects die young and objects that die in young-gen
  //        memory never need to be registered with the object_starts
  //        array.
  //     b) Most objects that are promoted into old-gen memory live
  //        there without further relocation for a relatively long
  //        time, so we get a lot of benefit from each investment
  //        in registering an object.

public:

  // The starting locations of objects contained within old-gen memory
  // are registered as part of the remembered set implementation.  This
  // information is required when scanning dirty card regions that are
  // spanned by objects beginning within preceding card regions.  It
  // is necessary to find the first and last objects that begin within
  // this card region.  Starting addresses of objects are required to
  // find the object headers, and object headers provide information
  // about which fields within the object hold addresses.
  //
  // The old-gen memory allocator invokes register_object() for any
  // object that is allocated within old-gen memory.  This identifies
  // the starting addresses of objects that span boundaries between
  // card regions.
  //
  // It is not necessary to invoke register_object at the very instant
  // an object is allocated.  It is only necessary to invoke it
  // prior to the next start of a garbage collection concurrent mark
  // or concurrent update-references phase.  An "ideal" time to register
  // objects is during post-processing of a GCLAB after the GCLAB is
  // retired due to depletion of its memory.
  //
  // register_object() does not perform synchronization.  In the case
  // that multiple threads are registering objects whose starting
  // addresses are within the same cluster, races between these
  // threads may result in corruption of the object-start data
  // structures.  Parallel GC threads should avoid registering objects
  // residing within the same cluster by adhering to the following
  // coordination protocols:
  //
  //  1. Align thread-local GCLAB buffers with some TBD multiple of
  //     card clusters.  The card cluster size is 32 KB.  If the
  //     desired GCLAB size is 128 KB, align the buffer on a multiple
  //     of 4 card clusters.
  //  2. Post-process the contents of GCLAB buffers to register the
  //     objects allocated therein.  Allow one GC thread at a
  //     time to do the post-processing of each GCLAB.
  //  3. Since only one GC thread at a time is registering objects
  //     belonging to a particular allocation buffer, no locking
  //     is performed when registering these objects.
  //  4. Any remnant of unallocated memory within an expended GC
  //     allocation buffer is not returned to the old-gen allocation
  //     pool until after the GC allocation buffer has been post-
  //     processed.  Before any remnant memory is returned to the
  //     old-gen allocation pool, the GC thread that scanned this GC
  //     allocation buffer performs a write-commit memory barrier.
  //  5. Background GC threads that perform tenuring of young-gen
  //     objects without a GCLAB use a CAS lock before registering
  //     each tenured object.  The CAS lock assures both mutual
  //     exclusion and memory coherency/visibility.  Note that an
  //     object tenured by a background GC thread will not overlap
  //     with any of the clusters that are receiving tenured objects
  //     by way of GCLAB buffers.  Multiple independent GC threads may
  //     attempt to tenure objects into a shared cluster.  This is why
  //     synchronization may be necessary.
  //     Consider the following scenarios:
  //
  //     a) If two objects are tenured into the same card region, each
  //        registration may attempt to modify the first-start or
  //        last-start information associated with that card region.
  //        Furthermore, because the representations of first-start
  //        and last-start information within the object_starts array
  //        entry use different bits of a shared uint16_t to represent
  //        each, it is necessary to lock the entire card entry
  //        before modifying either the first-start or last-start
  //        information within the entry.
  //     b) Suppose GC thread X promotes a tenured object into
  //        card region A and this tenured object spans into
  //        neighboring card region B.  Suppose GC thread Y (not equal
  //        to X) promotes a tenured object into card region B.  GC thread X
  //        will update the object_starts information for card A.  No
  //        synchronization is required.
  //     c) In summary, when background GC threads register objects
  //        newly tenured into old-gen memory, they must acquire a
  //        mutual exclusion lock on the card that holds the starting
  //        address of the newly tenured object.  This can be achieved
  //        by using a CAS instruction to assure that the previous
  //        values of first-offset and last-offset have not been
  //        changed since the same thread inquired as to their most
  //        current values.
  //
  //     One way to minimize the need for synchronization between
  //     background tenuring GC threads is for each tenuring GC thread
  //     to promote young-gen objects into distinct dedicated cluster
  //     ranges.
  //  6. The object_starts information is only required during the
  //     starting of concurrent marking and concurrent evacuation
  //     phases of GC.  Before we start either of these GC phases, the
  //     JVM enters a safepoint and all GC threads perform
  //     commit-write barriers to assure that access to the
  //     object_starts information is coherent.


  // Notes on synchronization of register_object():
  //
  //  1. For efficiency, there is no locking in the implementation of register_object().
  //  2. Thus, it is required that users of this service assure that concurrent/parallel invocations of
  //     register_object() do not pertain to the same card's memory range.  See the discussion below to understand
  //     the risks.
  //  3. When allocating from a TLAB or GCLAB, the mutual exclusion can be guaranteed by assuring that each
  //     LAB's start and end are aligned on card memory boundaries.
  //  4. Use the same lock that guarantees exclusivity when performing free-list allocation within heap regions.
  //
  // Register the newly allocated object while we're holding the global lock since there's no synchronization
  // built in to the implementation of register_object().  There are potential races when multiple independent
  // threads are allocating objects, some of which might span the same card region.  For example, consider
  // a card table's memory region within which three objects are being allocated by three different threads:
  //
  // objects being "concurrently" allocated:
  //    [-----a------][-----b-----][--------------c------------------]
  //            [---- card table memory range --------------]
  //
  // Before any objects are allocated, this card's memory range holds no objects.  Note that:
  //   allocation of object a wants to set the has-object, first-start, and last-start attributes of the preceding card region.
  //   allocation of object b wants to set the has-object, first-start, and last-start attributes of this card region.
  //   allocation of object c also wants to set the has-object, first-start, and last-start attributes of this card region.
  //
  // The thread allocating b and the thread allocating c can "race" in various ways, resulting in confusion, such as last-start
  // representing object b while first-start represents object c.  This is why we need to require all register_object()
  // invocations associated with objects that are allocated from "free lists" to provide their own mutual exclusion locking
  // mechanism.

  // Reset the starts_object() information to false for all cards in the range between from and to.
  void reset_object_range(HeapWord *from, HeapWord *to);

  // register_object() requires that the caller hold the heap lock
  // before calling it.
  void register_object(HeapWord* address);

  // register_object_wo_lock() does not require that the caller hold
  // the heap lock before calling it, under the assumption that the
  // caller has assured that no other thread will endeavor to concurrently
  // register objects that start within the same card's memory region
  // as address.
  void register_object_wo_lock(HeapWord* address);

  // During the reference-updates phase of GC, we walk through each old-gen memory region that was
  // not part of the collection set and we invalidate all unmarked objects.  As part of this effort,
  // we coalesce neighboring dead objects in order to make future remembered set scanning more
  // efficient (since future remembered set scanning of any card region containing consecutive
  // dead objects can skip over all of them at once by reading only a single dead object header
  // instead of having to read the header of each of the coalesced dead objects).
  //
  // At some future time, we may implement a further optimization: satisfy future allocation requests
  // by carving new objects out of the range of memory that represents the coalesced dead objects.
  //
  // Suppose we want to combine several dead objects into a single coalesced object.  How does this
  // impact our representation of crossing map information?
  //  1. If the newly coalesced range is contained entirely within a card range, that card's last
  //     start entry either remains the same or it is changed to the start of the coalesced region.
  //  2. For the card that holds the start of the coalesced object, it will not impact the first start
  //     but it may impact the last start.
  //  3. For following cards spanned entirely by the newly coalesced object, it will change starts_object
  //     to false (and make first-start and last-start "undefined").
  //  4. For a following card that is spanned partially by the newly coalesced object, it may change
  //     the first-start value, but it will not change the last-start value.
  //
  // The range of addresses represented by the arguments to coalesce_objects() must represent a range
  // of memory that was previously occupied exactly by one or more previously registered objects.  For
  // convenience, it is legal to invoke coalesce_objects() with arguments that span a single previously
  // registered object.
  //
  // The role of coalesce_objects() is to change the crossing map information associated with all of the coalesced
  // objects.
  void coalesce_objects(HeapWord* address, size_t length_in_words);

  // The typical use case is going to look something like this:
  //   for each heap region that comprises old-gen memory
  //     for each card number that corresponds to this heap region
  //       scan the objects contained therein if the card is dirty
  // To avoid excessive lookups in a sparse array, the API queries
  // the card number pertaining to a particular address and then uses the
  // card number for subsequent information lookups and stores.

  // If starts_object(card_index), this returns the word offset within this card
  // memory at which the first object begins.  If !starts_object(card_index), the
  // result is a don't-care value -- asserts in a debug build.
  size_t get_first_start(size_t card_index) const;

  // If starts_object(card_index), this returns the word offset within this card
  // memory at which the last object begins.  If !starts_object(card_index), the
  // result is a don't-care value.
  size_t get_last_start(size_t card_index) const;


  // Given a card_index, return the starting address of the first block in the heap
  // that straddles into the card.  If an object starts exactly at the beginning of
  // the card, this returns that object's starting address, which is also the base
  // address of the memory covered by this card.
  // Expects to be called for a card affiliated with the old generation in
  // generational mode.
  HeapWord* block_start(size_t card_index) const;
};

// ShenandoahScanRemembered is a concrete class representing the
// ability to scan the old-gen remembered set for references to
// objects residing in young-gen memory.
//
// Scanning normally begins with an invocation of numRegions and ends
// after all clusters of all regions have been scanned.
//
// Throughout the scanning effort, the number of regions does not
// change.
//
// Even though the regions that comprise old-gen memory are not
// necessarily contiguous, the abstraction represented by this class
// identifies each of the old-gen regions with an integer value
// in the range from 0 to (numRegions() - 1) inclusive.
//

template<typename RememberedSet>
class ShenandoahScanRemembered: public CHeapObj<mtGC> {

private:
  RememberedSet* _rs;
  ShenandoahCardCluster<RememberedSet>* _scc;

  // Global card stats (cumulative)
  HdrSeq _card_stats_scan_rs[MAX_CARD_STAT_TYPE];
  HdrSeq _card_stats_update_refs[MAX_CARD_STAT_TYPE];
  // Per-worker card stats (multiplexed by phase)
  HdrSeq** _card_stats;

  // The types of card metrics that we gather
  const char* _card_stats_name[MAX_CARD_STAT_TYPE] = {
    "dirty_run", "clean_run",
    "dirty_cards", "clean_cards",
    "max_dirty_run", "max_clean_run",
    "dirty_scan_objs",
    "alternations"
  };

  // The statistics are collected and logged separately for
  // card-scans for initial marking, and for updating refs.
  const char* _card_stat_log_type[MAX_CARD_STAT_LOG_TYPE] = {
    "Scan Remembered Set", "Update Refs"
  };

  int _card_stats_log_counter[2] = {0, 0};

public:
  // How to instantiate this object?
  //   ShenandoahDirectCardMarkRememberedSet *rs =
  //       new ShenandoahDirectCardMarkRememberedSet();
  //   scr = new
  //       ShenandoahScanRemembered<ShenandoahDirectCardMarkRememberedSet>(rs);
  //
  // or, after the planned implementation of
  // ShenandoahBufferWithSATBRememberedSet has been completed:
  //
  //   ShenandoahBufferWithSATBRememberedSet *rs =
  //       new ShenandoahBufferWithSATBRememberedSet();
  //   scr = new
  //       ShenandoahScanRemembered<ShenandoahBufferWithSATBRememberedSet>(rs);


  ShenandoahScanRemembered(RememberedSet *rs) {
    _rs = rs;
    _scc = new ShenandoahCardCluster<RememberedSet>(rs);

    // We allocate ParallelGCThreads worth even though we usually only
    // use up to ConcGCThreads, because degenerate collections may employ
    // ParallelGCThreads for remembered set scanning.
    if (ShenandoahEnableCardStats) {
      _card_stats = NEW_C_HEAP_ARRAY(HdrSeq*, ParallelGCThreads, mtGC);
      for (uint i = 0; i < ParallelGCThreads; i++) {
        _card_stats[i] = new HdrSeq[MAX_CARD_STAT_TYPE];
      }
    } else {
      _card_stats = nullptr;
    }
  }

  ~ShenandoahScanRemembered() {
    delete _scc;
    if (ShenandoahEnableCardStats) {
      for (uint i = 0; i < ParallelGCThreads; i++) {
        delete[] _card_stats[i];
      }
      FREE_C_HEAP_ARRAY(HdrSeq*, _card_stats);
      _card_stats = nullptr;
    }
    assert(_card_stats == nullptr, "Error");
  }

  HdrSeq* card_stats(uint worker_id) {
    assert(worker_id < ParallelGCThreads, "Error");
    assert(ShenandoahEnableCardStats == (_card_stats != nullptr), "Error");
    return ShenandoahEnableCardStats ? _card_stats[worker_id] : nullptr;
  }

  HdrSeq* card_stats_for_phase(CardStatLogType t) {
    switch (t) {
      case CARD_STAT_SCAN_RS:
        return _card_stats_scan_rs;
      case CARD_STAT_UPDATE_REFS:
        return _card_stats_update_refs;
      default:
        guarantee(false, "No such CardStatLogType");
    }
    return nullptr; // Quiet compiler
  }

  // TODO: We really don't want to share all of these APIs with arbitrary consumers of the ShenandoahScanRemembered abstraction.
  // But in the spirit of quick and dirty for the time being, I'm going to go ahead and publish everything for right now.  Some
  // of the existing code already depends on having access to these services (because existing code has not been written to honor
  // full abstraction of remembered set scanning).  In the not too distant future, we want to try to make most, if not all, of
  // these services private.  Two problems with publicizing:
  //  1. Allowing arbitrary users to reach beneath the hood allows the users to make assumptions about the underlying implementation.
  //     This will make it more difficult to change the underlying implementation at a future time, such as when we eventually
  //     experiment with an SATB-based implementation of remembered set representation.
  //  2. If we carefully control sharing of certain of these services, we can reduce the overhead of synchronization by assuring
  //     that all users follow protocols that avoid contention that might require synchronization.  When we publish these APIs, we
  //     lose control over who accesses the data and how.  As a result, we are required to insert more defensive measures into
  //     the implementation, including synchronization locks.


  // Card index is zero-based relative to first spanned card region.
  size_t last_valid_index();
  size_t total_cards();
  size_t card_index_for_addr(HeapWord *p);
  HeapWord *addr_for_card_index(size_t card_index);
  bool is_card_dirty(size_t card_index);
  bool is_write_card_dirty(size_t card_index) { return _rs->is_write_card_dirty(card_index); }
  void mark_card_as_dirty(size_t card_index);
  void mark_range_as_dirty(size_t card_index, size_t num_cards);
  void mark_card_as_clean(size_t card_index);
  void mark_read_card_as_clean(size_t card_index) { _rs->mark_read_card_as_clean(card_index); }
  void mark_range_as_clean(size_t card_index, size_t num_cards);
  bool is_card_dirty(HeapWord *p);
  void mark_card_as_dirty(HeapWord *p);
  void mark_range_as_dirty(HeapWord *p, size_t num_heap_words);
  void mark_card_as_clean(HeapWord *p);
  void mark_range_as_clean(HeapWord *p, size_t num_heap_words);
  size_t cluster_count();

  // Called by GC thread at start of concurrent mark to exchange roles of read and write remembered sets.
  void swap_remset() { _rs->swap_remset(); }

  void reset_remset(HeapWord* start, size_t word_count) { _rs->reset_remset(start, word_count); }

  void merge_write_table(HeapWord* start, size_t word_count) { _rs->merge_write_table(start, word_count); }

  // Called by GC thread after scanning old remembered set in order to prepare for next GC pass
  void clear_old_remset() { _rs->clear_old_remset(); }

  size_t cluster_for_addr(HeapWord *addr);
  HeapWord* addr_for_cluster(size_t cluster_no);

  void reset_object_range(HeapWord *from, HeapWord *to);
  void register_object(HeapWord *addr);
  void register_object_wo_lock(HeapWord *addr);
  void coalesce_objects(HeapWord *addr, size_t length_in_words);

  HeapWord* first_object_in_card(size_t card_index) {
    if (_scc->starts_object(card_index)) {
      return addr_for_card_index(card_index) + _scc->get_first_start(card_index);
    } else {
      return nullptr;
    }
  }

  // Return true iff this object is "properly" registered.
  bool verify_registration(HeapWord* address, ShenandoahMarkingContext* ctx);

  // Clear the cards to clean, and clear the object_starts info to no objects.
  void mark_range_as_empty(HeapWord *addr, size_t length_in_words);

  // process_clusters() scans a portion of the remembered set
  // for references from old gen into young.  Several worker threads
  // scan different portions of the remembered set by making parallel invocations
  // of process_clusters() with each invocation scanning different
  // "clusters" of the remembered set.
  //
  // An invocation of process_clusters() examines all of the
  // intergenerational references spanned by `count` clusters starting
  // with `first_cluster`.  The `oops` argument is a worker-thread-local
  // OopClosure that is applied to all "valid" references in the remembered set.
  //
  // A side effect of executing process_clusters() is to update the remembered
  // set entries (e.g. marking dirty cards clean if they no longer
  // hold references to young-gen memory).
  //
  // An implementation of process_clusters() may choose to efficiently
  // address more typical scenarios in the structure of remembered sets.  E.g.
  // in the generational setting, one might expect remembered sets to be very sparse
  // (low mutation rates in the old generation leading to sparse dirty cards,
  // each with very few intergenerational pointers).
  // Specific implementations
  // may choose to degrade gracefully as the sparsity assumption fails to hold,
  // such as when there are sudden spikes in (premature) promotion or in the
  // case of an underprovisioned, poorly-tuned, or poorly-shaped heap.
  //
  // At the start of a concurrent young generation marking cycle, we invoke process_clusters
  // with ClosureType ShenandoahInitMarkRootsClosure.
  //
  // At the start of a concurrent evacuation phase, we invoke process_clusters with
  // ClosureType ShenandoahEvacuateUpdateRootsClosure.

  // All template expansions require methods to be defined in the inline.hpp file, but larger
  // such methods need not be declared as inline.
  template <typename ClosureType>
  void process_clusters(size_t first_cluster, size_t count, HeapWord *end_of_range, ClosureType *oops,
                        bool use_write_table, uint worker_id);

  template <typename ClosureType>
  inline void process_humongous_clusters(ShenandoahHeapRegion* r, size_t first_cluster, size_t count,
                                         HeapWord *end_of_range, ClosureType *oops, bool use_write_table);

  template <typename ClosureType>
  inline void process_region_slice(ShenandoahHeapRegion* region, size_t offset, size_t clusters, HeapWord* end_of_range,
                                   ClosureType *cl, bool use_write_table, uint worker_id);

  // To Do:
  //  Create subclasses of ShenandoahInitMarkRootsClosure and
  //  ShenandoahEvacuateUpdateRootsClosure and any other closures
  //  that need to participate in remembered set scanning.  Within the
  //  subclasses, add a (probably templated) instance variable that
  //  refers to the associated ShenandoahCardCluster object.  Use this
  //  ShenandoahCardCluster instance to "enhance" the do_oops
  //  processing so that we can:
  //
  //   1. Avoid processing references that correspond to clean card
  //      regions, and
  //   2. Set card status to CLEAN when the associated card region no
  //      longer holds inter-generational references.
  //
  //  To enable efficient implementation of these behaviors, we
  //  probably also want to add a few fields into the
  //  ShenandoahCardCluster object that allow us to precompute and
  //  remember the addresses at which card status is going to change
  //  from dirty to clean and clean to dirty.  The do_oops
  //  implementations will want to update this value each time they
  //  cross one of these boundaries.
  void roots_do(OopIterateClosure* cl);

  // Log stats related to card/RS stats for given phase t
  void log_card_stats(uint nworkers, CardStatLogType t) PRODUCT_RETURN;
private:
  // Log stats for given worker id related into given cumulative card/RS stats
  void log_worker_card_stats(uint worker_id, HdrSeq* cum_stats) PRODUCT_RETURN;

  // Log given stats
  inline void log_card_stats(HdrSeq* stats) PRODUCT_RETURN;

  // Merge the stats from worker_id into the given summary stats, and clear the worker_id's stats.
  void merge_worker_card_stats_cumulative(HdrSeq* worker_stats, HdrSeq* cum_stats) PRODUCT_RETURN;
};
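
// As a sketch of how a worker might drive the scan of one slice of an old-gen
// region (illustrative only: the scanner pointer, the slice bookkeeping, and
// the closure are assumptions; the actual driver is expected to be a task such
// as ShenandoahScanRememberedTask, declared below):
//
//   ShenandoahScanRemembered<ShenandoahDirectCardMarkRememberedSet>* scanner = ...;
//   ShenandoahHeapRegion* r = ...;           // an old-gen region
//   size_t offset_in_words = ...;            // where this worker's slice begins within r
//   size_t clusters = ...;                   // number of clusters covered by the slice
//   HeapWord* end_of_range = r->top();
//   scanner->process_region_slice(r, offset_in_words, clusters, end_of_range,
//                                 &closure, false /* use_write_table */, worker_id);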

// A ShenandoahRegionChunk represents a contiguous interval of a ShenandoahHeapRegion, typically representing
// work to be done by a worker thread.
struct ShenandoahRegionChunk {
  ShenandoahHeapRegion *_r;      // The region of which this represents a chunk
  size_t _chunk_offset;          // HeapWordSize offset
  size_t _chunk_size;            // HeapWordSize qty
};

// ShenandoahRegionChunkIterator divides the total remembered set scanning effort into ShenandoahRegionChunks
// that are assigned one at a time to worker threads.  (Here, we use the terms `assignments` and `chunks`
// interchangeably.)  Note that the effort required to scan a range of memory is not necessarily a linear
// function of the size of the range.  Some memory ranges hold only a small number of live objects.
// Some ranges hold primarily primitive (non-pointer) data.  We start with larger chunk sizes because larger chunks
// reduce coordination overhead.  We expect that the GC worker threads that receive more difficult assignments
// will work longer on those chunks.  Meanwhile, other worker threads will repeatedly accept and complete multiple
// easier chunks.  As the total amount of work remaining to be completed decreases, we decrease the size of chunks
// given to individual threads.  This reduces the likelihood of a significant imbalance between worker thread
// assignments near the end of the phase, when idle workers would otherwise wait for workers with difficult
// assignments to finish, and thereby reduces the overall duration of the phase.

class ShenandoahRegionChunkIterator : public StackObj {
private:
  // The largest chunk size is 4 MiB, measured in words.  Otherwise, remembered set scanning may become too unbalanced.
  // If the largest chunk size is too small, there is too much overhead sifting out assignments to individual worker threads.
  static const size_t _maximum_chunk_size_words = (4 * 1024 * 1024) / HeapWordSize;

  static const size_t _clusters_in_smallest_chunk = 4;

  // The smallest chunk is 4 clusters.  Each cluster spans 32 KiB, so the smallest chunk spans 128 KiB.
  // This is computed from CardTable::card_size_in_words() *
  //      ShenandoahCardCluster<ShenandoahDirectCardMarkRememberedSet>::CardsPerCluster;
  static const size_t smallest_chunk_size_words() {
    return _clusters_in_smallest_chunk * CardTable::card_size_in_words() *
           ShenandoahCardCluster<ShenandoahDirectCardMarkRememberedSet>::CardsPerCluster;
  }

  // The total remembered set scanning effort is divided into chunks of work that are assigned to individual worker tasks.
  // The chunks of assigned work are divided into groups, where the size of the typical group (_regular_group_size) is half
  // the total number of regions.  The first group may be larger than _regular_group_size in the case that the first group's
  // chunk size is less than the region size.  The last group may be larger than _regular_group_size because no group is
  // allowed to have smaller assignments than the smallest chunk size, which spans 128 KiB.

  // Under normal circumstances, no configuration needs more than _maximum_groups (currently 6).
  // The first group "effectively" processes chunks of size 1 MiB (or smaller for smaller region sizes).
  // The last group processes chunks of size 128 KiB.  The groups are enumerated below.
  //
  //     group[0] is 4 MiB chunk size (_maximum_chunk_size_words)
  //     group[1] is 2 MiB chunk size
  //     group[2] is 1 MiB chunk size
  //     group[3] is 512 KiB chunk size
  //     group[4] is 256 KiB chunk size
  //     group[5] is 128 KiB chunk size (smallest_chunk_size_words() = 4 * 64 * 64)
  static const size_t _maximum_groups = 6;

  const ShenandoahHeap* _heap;

  const size_t _regular_group_size;                  // Number of chunks in each group
  const size_t _first_group_chunk_size_b4_rebalance;
  const size_t _num_groups;                          // Number of groups in this configuration
  const size_t _total_chunks;

  shenandoah_padding(0);
  volatile size_t _index;
  shenandoah_padding(1);

  size_t _region_index[_maximum_groups];             // The region index for the first region spanned by this group
  size_t _group_offset[_maximum_groups];             // The offset at which the group begins within the first region spanned by this group
  size_t _group_chunk_size[_maximum_groups];         // The size of each chunk within this group
  size_t _group_entries[_maximum_groups];            // Total chunks spanned by this group and the ones before it

  // No implicit copying: iterators should be passed by reference to capture the state
  NONCOPYABLE(ShenandoahRegionChunkIterator);

  // Makes use of _heap.
  size_t calc_regular_group_size();

  // Makes use of _regular_group_size, which must be initialized before the call.
  size_t calc_first_group_chunk_size_b4_rebalance();

  // Makes use of _regular_group_size and _first_group_chunk_size_b4_rebalance, both of which must be initialized before the call.
  size_t calc_num_groups();

  // Makes use of _regular_group_size and _first_group_chunk_size_b4_rebalance, both of which must be initialized before the call.
  size_t calc_total_chunks();

public:
  ShenandoahRegionChunkIterator(size_t worker_count);
  ShenandoahRegionChunkIterator(ShenandoahHeap* heap, size_t worker_count);

  // Reset iterator to default state
  void reset();

  // Fills in assignment with the next chunk of work and returns true iff there is more work;
  // otherwise, returns false.  This is multi-thread-safe.
  inline bool next(struct ShenandoahRegionChunk *assignment);
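
  // Typical consumption pattern for a worker (a sketch; the surrounding task
  // and the scanning performed on each chunk are assumptions, not part of
  // this class):
  //
  //   ShenandoahRegionChunk assignment;
  //   while (iterator.next(&assignment)) {
  //     // scan assignment._r, starting at word offset assignment._chunk_offset
  //     // and covering assignment._chunk_size words, then loop for more work
  //   }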

  // This is *not* MT safe.  However, in the absence of multithreaded access, it
  // can be used to determine if there is more work to do.
  inline bool has_next() const;
};

typedef ShenandoahScanRemembered<ShenandoahDirectCardMarkRememberedSet> RememberedScanner;

class ShenandoahScanRememberedTask : public WorkerTask {
private:
  ShenandoahObjToScanQueueSet* _queue_set;
  ShenandoahObjToScanQueueSet* _old_queue_set;
  ShenandoahReferenceProcessor* _rp;
  ShenandoahRegionChunkIterator* _work_list;
  bool _is_concurrent;

public:
  ShenandoahScanRememberedTask(ShenandoahObjToScanQueueSet* queue_set,
                               ShenandoahObjToScanQueueSet* old_queue_set,
                               ShenandoahReferenceProcessor* rp,
                               ShenandoahRegionChunkIterator* work_list,
                               bool is_concurrent);

  void work(uint worker_id);
  void do_work(uint worker_id);
};

// Verify that the oop doesn't point into the young generation
class ShenandoahVerifyNoYoungRefsClosure: public BasicOopIterateClosure {
  ShenandoahHeap* _heap;
  template<class T> void work(T* p);

public:
  ShenandoahVerifyNoYoungRefsClosure();

  virtual void do_oop(narrowOop* p) { work(p); }
  virtual void do_oop(oop* p)       { work(p); }
};

#endif // SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP