/*
 * Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP
#define SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP

// Terminology used within this source file:
//
// Card Entry:   This is the information that identifies whether a
//               particular card-table entry is Clean or Dirty.  A clean
//               card entry denotes that the associated memory does not
//               hold references to young-gen memory.
//
// Card Region, aka
// Card Memory:  This is the region of memory that is associated with a
//               particular card entry.
//
// Card Cluster: A card cluster represents 64 card entries.  A card
//               cluster is the minimal amount of work performed at a
//               time by a parallel thread.  Note that the work required
//               to scan a card cluster is somewhat variable in that the
//               required effort depends on how many cards are dirty, how
//               many references are held within the objects that span a
//               DIRTY card's memory, and on the size of the object
//               that spans the end of a DIRTY card's memory (because
//               that object, if it's not an array, may need to be scanned
//               in its entirety when the object is imprecisely dirtied.
//               Imprecise dirtying is when the card corresponding to the
//               object header is dirtied, rather than the card on which
//               the updated field lives).  To better balance work amongst
//               them, parallel worker threads dynamically claim clusters
//               and are flexible in the number of clusters they process.
//
// A cluster represents a "natural" quantum of work to be performed by
// a parallel GC thread's background remembered set scanning efforts.
// The notion of cluster is similar to the notion of stripe in the
// implementation of parallel GC card scanning.  However, a cluster is
// typically smaller than a stripe, enabling finer grain division of
// labor between multiple threads, and potentially better load balancing
// when dirty cards are not uniformly distributed in the heap, as is often
// the case with generational workloads where more recently promoted objects
// may be dirtied more frequently than older objects.
//
// For illustration, consider the following possible JVM configurations:
//
//   Scenario 1:
//     RegionSize is 128 MB
//     Span of a card entry is 512 B
//     Each card table entry consumes 1 B
//     Assume one long word (8 B) of the card table represents a cluster.
//       This long word holds 8 card table entries, spanning a
//       total of 8 * 512 B = 4 KB of the heap
//     The number of clusters per region is 128 MB / 4 KB = 32 K
//
//   Scenario 2:
//     RegionSize is 128 MB
//     Span of each card entry is 128 B
//     Each card table entry consumes 1 bit
//     Assume one int word (4 B) of the card table represents a cluster.
//       This int word holds 32 b / 1 b = 32 card table entries, spanning a
//       total of 32 * 128 B = 4 KB of the heap
//     The number of clusters per region is 128 MB / 4 KB = 32 K
//
//   Scenario 3:
//     RegionSize is 128 MB
//     Span of each card entry is 512 B
//     Each card table entry consumes 1 bit
//     Assume one long word (8 B) of card table represents a cluster.
//       This long word holds 64 b / 1 b = 64 card table entries, spanning a
//       total of 64 * 512 B = 32 KB of the heap
//     The number of clusters per region is 128 MB / 32 KB = 4 K
//
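// The arithmetic of these scenarios generalizes as in the following sketch
// (illustrative constants, not actual JVM flags; Scenario 3 matches the
// configuration assumed in the rest of this file):
//
//   constexpr size_t region_size_bytes   = 128 * M;    // 128 MB region
//   constexpr size_t card_size_bytes     = 512;        // heap bytes spanned per card entry
//   constexpr size_t cards_per_cluster   = 64;
//   constexpr size_t cluster_span_bytes  = cards_per_cluster * card_size_bytes;      // 32 KB
//   constexpr size_t clusters_per_region = region_size_bytes / cluster_span_bytes;   // 4 K
//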
// At the start of a new young-gen concurrent mark pass, the gang of
// Shenandoah worker threads collaborate in performing the following
// actions:
//
//   Let old_regions = number of ShenandoahHeapRegions comprising
//     old-gen memory
//   Let region_size = ShenandoahHeapRegion::region_size_bytes()
//     represent the number of bytes in each region
//   Let clusters_per_region = region_size / (512 * 64), since each
//     cluster spans 64 cards of 512 B each
//   Let rs represent the ShenandoahDirectCardMarkRememberedSet
//
//   for each ShenandoahHeapRegion old_region in the whole heap
//     determine the cluster number of the first cluster belonging
//       to that region
//     for each cluster contained within that region
//       Assure that exactly one worker thread processes each
//       cluster, each thread making a series of invocations of the
//       following (matching the declaration of process_clusters() below):
//
//         rs->process_clusters(first_cluster, cluster_count,
//                              end_of_range, oops, use_write_table,
//                              worker_id);
//
//   For efficiency, divide up the clusters so that different threads
//   are responsible for processing different clusters.  Processing costs
//   may vary greatly between clusters for the following reasons:
//
//     a) some clusters contain mostly dirty cards and other
//        clusters contain mostly clean cards
//     b) some clusters contain mostly primitive data and other
//        clusters contain mostly reference data
//     c) some clusters are spanned by very large non-array objects that
//        begin in some other cluster.  When a large non-array object
//        beginning in a preceding cluster spans large portions of
//        this cluster, then because of imprecise dirtying, the
//        portion of the object in this cluster may be clean, but
//        will need to be processed by the worker responsible for
//        this cluster, potentially increasing its work.
//     d) in the case that the end of this cluster is spanned by a
//        very large non-array object, the worker for this cluster will
//        be responsible for processing the portion of the object
//        in this cluster.
//
// Though an initial division of labor between marking threads may
// assign equal numbers of clusters to be scanned by each thread, it
// should be expected that some threads will finish their assigned
// work before others.  Therefore, some amount of the full remembered
// set scanning effort should be held back and assigned incrementally
// to the threads that end up with excess capacity.  Consider the
// following strategy for dividing labor:
//
//   1. Assume there are 8 marking threads and 1024 remembered
//      set clusters to be scanned.
//   2. Assign each thread to scan 64 clusters.  This leaves
//      512 (1024 - (8*64)) clusters to still be scanned.
//   3. As the 8 worker threads complete previous cluster
//      scanning assignments, issue each of the next 8 scanning
//      assignments as units of 32 additional clusters each.
//      In the case that there is high variance in effort
//      associated with previous cluster scanning assignments,
//      several of these next assignments may be serviced by
//      the worker threads that were previously assigned lighter
//      workloads.
//   4. Make subsequent scanning assignments as follows (see the
//      sketch below):
//      a) 8 assignments of size 16 clusters
//      b) 8 assignments of size 8 clusters
//      c) 16 assignments of size 4 clusters
//
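// The decaying schedule above can be sketched as follows (a simplified
// model for illustration, not the actual implementation; the production
// analogue of this idea is ShenandoahRegionChunkIterator, declared below):
//
//   size_t remaining = 1024;   // clusters not yet assigned
//   size_t batch     = 64;     // clusters per assignment
//   while (batch > 4) {
//     // issue 8 assignments of `batch` clusters each
//     remaining -= 8 * batch;
//     batch /= 2;              // halve the quantum as work drains
//   }
//   // finish with 16 assignments of 4 clusters each:
//   //   8*64 + 8*32 + 8*16 + 8*8 + 16*4 = 1024
//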
// When there is no more remembered set processing work to be
// assigned to a newly idled worker thread, that thread can move
// on to work on other tasks associated with root scanning until such
// time as all clusters have been examined.
//
// Remembered set scanning is designed to run concurrently with
// mutator threads, with multiple concurrent workers.  Furthermore, the
// current implementation of remembered set scanning never clears a
// card once it has been marked.
//
// These limitations will be addressed in future enhancements to the
// existing implementation.

#include "gc/shared/workerThread.hpp"
#include "gc/shenandoah/shenandoahCardStats.hpp"
#include "gc/shenandoah/shenandoahCardTable.hpp"
#include "gc/shenandoah/shenandoahNumberSeq.hpp"
#include "gc/shenandoah/shenandoahTaskqueue.hpp"
#include "memory/iterator.hpp"
#include "utilities/globalDefinitions.hpp"

class ShenandoahReferenceProcessor;
class ShenandoahConcurrentMark;
class ShenandoahHeap;
class ShenandoahHeapRegion;
class ShenandoahRegionIterator;
class ShenandoahMarkingContext;

class CardTable;
typedef CardTable::CardValue CardValue;

class ShenandoahDirectCardMarkRememberedSet: public CHeapObj<mtGC> {

private:

  // Use symbolic constants defined in cardTable.hpp
  //   CardTable::card_shift = 9;
  //   CardTable::card_size = 512;           (default value 512, a power of 2 >= 128)
  //   CardTable::card_size_in_words = 64;   (default value 64, a power of 2 >= 16)
  //   CardTable::clean_card_val()
  //   CardTable::dirty_card_val()

  // See shenandoahCardTable.hpp
  //   ShenandoahMinCardSizeInBytes 128

  const size_t LogCardValsPerIntPtr;    // log2 of the number of card values (entries) in an intptr_t
  const size_t LogCardSizeInWords;      // log2 of the size of a card in heap word units

  ShenandoahHeap* _heap;
  ShenandoahCardTable* _card_table;
  size_t _card_shift;
  size_t _total_card_count;
  HeapWord* _whole_heap_base;   // Points to first HeapWord of data contained within heap memory
  CardValue* _byte_map;         // Points to first entry within the card table
  CardValue* _byte_map_base;    // Points to byte_map minus the bias computed from address of heap memory

public:

  // count is the number of cards represented by the card table.
  ShenandoahDirectCardMarkRememberedSet(ShenandoahCardTable* card_table, size_t total_card_count);

  // Card index is zero-based relative to _byte_map.
  size_t last_valid_index() const;
  size_t total_cards() const;
  size_t card_index_for_addr(HeapWord* p) const;
  HeapWord* addr_for_card_index(size_t card_index) const;

  inline const CardValue* get_card_table_byte_map(bool use_write_table) const {
    return use_write_table ? _card_table->write_byte_map() : _card_table->read_byte_map();
  }

  inline bool is_card_dirty(size_t card_index) const;
  inline bool is_write_card_dirty(size_t card_index) const;
  inline void mark_card_as_dirty(size_t card_index);
  inline void mark_range_as_dirty(size_t card_index, size_t num_cards);
  inline void mark_card_as_clean(size_t card_index);
  inline void mark_range_as_clean(size_t card_index, size_t num_cards);
  inline bool is_card_dirty(HeapWord* p) const;
  inline bool is_write_card_dirty(HeapWord* p) const;
  inline void mark_card_as_dirty(HeapWord* p);
  inline void mark_range_as_dirty(HeapWord* p, size_t num_heap_words);
  inline void mark_range_as_clean(HeapWord* p, size_t num_heap_words);

  // See comment in ShenandoahScanRemembered
  inline void mark_read_table_as_clean();

  // Merge any dirty values from the write table into the read table, while leaving
  // the write table unchanged.
  void merge_write_table(HeapWord* start, size_t word_count);

  // See comment in ShenandoahScanRemembered
  void swap_card_tables();
};
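
// A sketch of typical use of the index-based and address-based services above
// (illustrative only; `rs` is assumed to point at an initialized
// ShenandoahDirectCardMarkRememberedSet and `p` at a heap address it spans):
//
//   size_t index = rs->card_index_for_addr(p);
//   if (rs->is_card_dirty(index)) {
//     // scan the card's memory, which begins at rs->addr_for_card_index(index)
//   }
//   rs->mark_card_as_dirty(p);   // address-based equivalent of the index-based form
//   rs->mark_range_as_clean(p, 2 * CardTable::card_size_in_words());  // clean two cards' span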

// A ShenandoahCardCluster represents the minimal unit of work
// performed by independent parallel GC threads during scanning of
// remembered sets.
//
// The GC threads that perform card-table remembered set scanning may
// overwrite card-table entries to mark them as clean in the case that
// the associated memory no longer holds references to young-gen
// memory.  Rather than access the card-table entries directly, all GC
// thread access to card-table information is made by way of the
// ShenandoahCardCluster data abstraction.  This abstraction
// effectively manages access to multiple possible underlying
// remembered set implementations, including a traditional card-table
// approach and a SATB-based approach.
//
// The API services represent a compromise between efficiency and
// convenience.
//
// Multiple GC threads scan the remembered set in parallel.  The
// desire is to divide the complete scanning effort into multiple
// clusters of work that can be independently processed by individual
// threads without need for synchronizing efforts between the work
// performed by each task.  The term "cluster" of work is similar to
// the term "stripe" as used in the implementation of Parallel GC.
//
// Complexity arises when an object to be scanned crosses the boundary
// between adjacent cluster regions.  Here is the protocol that we currently
// follow:
//
//   1. The thread responsible for scanning the cards in a cluster modifies
//      the associated card-table entries.  Only cards that are dirty are
//      processed, except as described below for the case of objects that
//      straddle more than one card.
//   2. Object arrays are precisely dirtied, so only the portion of the obj-array
//      that overlaps the range of dirty cards in its cluster is scanned
//      by each worker thread.  This holds for portions of obj-arrays that extend
//      over clusters processed by different workers, with each worker responsible
//      for scanning the portion of the obj-array overlapping the dirty cards in
//      its cluster.
//   3. Non-array objects are precisely dirtied by the interpreter and the compilers.
//      For such objects that extend over multiple cards, or even multiple clusters,
//      the entire object is scanned by the worker that processes the (dirty) card on
//      which the object's header lies.  (However, GC workers should precisely dirty the
//      cards with inter-regional/inter-generational pointers in the body of this object,
//      thus making subsequent scans potentially less expensive.)  Such larger non-array
//      objects are relatively rare.
//
// A possible criticism:
//   The representation of pointer location descriptive information
//   within Klass representations is not designed for efficient
//   "random access".  An alternative approach to this design would
//   be to scan very large objects multiple times, once for each
//   cluster that is spanned by the object's range.  This reduces
//   unnecessary overscan, but it introduces different sorts of
//   overhead effort:
//     i) For each spanned cluster, we have to look up the start of
//        the crossing object.
//    ii) Each time we scan the very large object, we have to
//        sequentially walk through its pointer location
//        descriptors, skipping over all of the pointers that
//        precede the start of the range of addresses that we
//        consider relevant.

// Because old-gen heap memory is not necessarily contiguous, and
// because cards are not necessarily maintained for young-gen memory,
// consecutive card numbers do not necessarily correspond to consecutive
// address ranges.  For the traditional direct-card-marking
// implementation of this interface, consecutive card numbers are
// likely to correspond to contiguous regions of memory, but this
// should not be assumed.  Instead, rely only upon the following:
//
//   1. All card numbers for cards pertaining to the same
//      ShenandoahHeapRegion are consecutively numbered.
//   2. In the case that neighboring ShenandoahHeapRegions both
//      represent old-gen memory, the card regions that span the
//      boundary between these neighboring heap regions will be
//      consecutively numbered.
//   3. (A corollary) In the case that an old-gen object straddles the
//      boundary between two heap regions, the card regions that
//      correspond to the span of this object will be consecutively
//      numbered.
//
// ShenandoahCardCluster abstracts access to the remembered set
// and also keeps track of crossing map information to allow efficient
// resolution of object start addresses.
//
// ShenandoahCardCluster supports all of the services of
// ShenandoahDirectCardMarkRememberedSet, plus it supports register_object() and lookup_object().
// Note that we only need to register the start addresses of the object that
// overlays the first address of a card; we need to do this for every card.
// In other words, register_object() checks if the object crosses a card boundary,
// and updates the offset value for each card that the object crosses into.
// For objects that don't straddle cards, nothing needs to be done.
//
class ShenandoahCardCluster: public CHeapObj<mtGC> {

private:
  ShenandoahDirectCardMarkRememberedSet* _rs;

public:
  static const size_t CardsPerCluster = 64;

private:
  typedef struct cross_map { uint8_t first; uint8_t last; } xmap;
  typedef union crossing_info { uint16_t short_word; xmap offsets; } crossing_info;

  // The ObjectStartsInCardRegion bit is set within crossing_info.offsets.first iff at least one object starts within
  // a particular card region.  We pack this bit into the first byte under the assumption that the first byte is
  // accessed less frequently than the last byte.  This is true when the number of clean cards is greater than the
  // number of dirty cards.
  static const uint8_t ObjectStartsInCardRegion = 0x80;
  static const uint8_t FirstStartBits           = 0x7f;

  // Check that we have enough bits to store the largest possible offset into a card for an object start.
  // The value for maximum card size is based on the constraints for GCCardSizeInBytes in gc_globals.hpp.
  static const int MaxCardSize = NOT_LP64(512) LP64_ONLY(1024);
  STATIC_ASSERT((MaxCardSize / HeapWordSize) - 1 <= FirstStartBits);
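
  // For illustration, with 512-byte (64-word) cards: if the first object in a card's memory
  // begins at word offset 5 and the last begins at word offset 60, the card's crossing_info
  // holds offsets.first == (ObjectStartsInCardRegion | 5) == 0x85 and offsets.last == 60 == 0x3c.
  // A card into which no object start has been registered has short_word == 0, so its
  // starts_object() is false.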

  crossing_info* _object_starts;

public:
  // If we're setting first_start, assume the card has an object.
  inline void set_first_start(size_t card_index, uint8_t value) {
    _object_starts[card_index].offsets.first = ObjectStartsInCardRegion | value;
  }

  inline void set_last_start(size_t card_index, uint8_t value) {
    _object_starts[card_index].offsets.last = value;
  }

  inline void set_starts_object_bit(size_t card_index) {
    _object_starts[card_index].offsets.first |= ObjectStartsInCardRegion;
  }

  inline void clear_starts_object_bit(size_t card_index) {
    _object_starts[card_index].offsets.first &= ~ObjectStartsInCardRegion;
  }

  // Returns true iff an object is known to start within the card memory associated with card card_index.
  inline bool starts_object(size_t card_index) const {
    return (_object_starts[card_index].offsets.first & ObjectStartsInCardRegion) != 0;
  }

  // Clear all object-start information for the cards spanned by [addr, addr + num_words).
  inline void clear_objects_in_range(HeapWord* addr, size_t num_words) {
    size_t card_index = _rs->card_index_for_addr(addr);
    size_t last_card_index = _rs->card_index_for_addr(addr + num_words - 1);
    while (card_index <= last_card_index) {
      _object_starts[card_index++].short_word = 0;
    }
  }

  ShenandoahCardCluster(ShenandoahDirectCardMarkRememberedSet* rs) {
    _rs = rs;
    // The +1 accounts for the card table's guard entry.
    _object_starts = NEW_C_HEAP_ARRAY(crossing_info, rs->total_cards() + 1, mtGC);
    for (size_t i = 0; i < rs->total_cards(); i++) {
      _object_starts[i].short_word = 0;
    }
  }

  ~ShenandoahCardCluster() {
    FREE_C_HEAP_ARRAY(crossing_info, _object_starts);
    _object_starts = nullptr;
  }

  // There is one entry within the object_starts array for each card entry.
  //
  // Suppose multiple garbage objects are coalesced during GC sweep
  // into a single larger "free segment".  As each pair of objects is
  // coalesced, the start information pertaining to the second
  // object must be removed from the object_starts array.  If the
  // second object had been the first object within card memory,
  // the new first object is the object that follows that object if
  // that starts within the same card memory, or NoObject if the
  // following object starts within the following cluster.  If the
  // second object had been the last object in the card memory,
  // replace this entry with the newly coalesced object if it starts
  // within the same card memory, or with NoObject if it starts in a
  // preceding card's memory.
  //
  // Suppose a large free segment is divided into a smaller free
  // segment and a new object.  The second part of the newly divided
  // memory must be registered as a new object, overwriting at most
  // one first_start and one last_start entry.  Note that one of the
  // newly divided two objects might be a new GCLAB.
  //
  // Suppose postprocessing of a GCLAB finds that the original GCLAB
  // has been divided into N objects.  Each of the N newly allocated
  // objects will be registered, overwriting at most one first_start
  // and one last_start entry each.
  //
  // No object registration operation is linear in the length of
  // the registered object.
  //
  // Consider further the following observations regarding object
  // registration costs:
  //
  //   1. The cost is paid once for each old-gen object (except when
  //      an object is demoted and repromoted, in which case we would
  //      pay the cost again).
  //   2. The cost can be deferred so that there is no urgency during
  //      mutator copy-on-first-access promotion.  Background GC
  //      threads will update the object_starts array by post-
  //      processing the contents of retired PLAB buffers.
  //   3. The bet is that these costs are paid relatively rarely
  //      because:
  //      a) Most objects die young and objects that die in young-gen
  //         memory never need to be registered with the object_starts
  //         array.
  //      b) Most objects that are promoted into old-gen memory live
  //         there without further relocation for a relatively long
  //         time, so we get a lot of benefit from each investment
  //         in registering an object.

public:

  // The starting locations of objects contained within old-gen memory
  // are registered as part of the remembered set implementation.  This
  // information is required when scanning dirty card regions that are
  // spanned by objects beginning within preceding card regions.  It
  // is necessary to find the first and last objects that begin within
  // this card region.  Starting addresses of objects are required to
  // find the object headers, and object headers provide information
  // about which fields within the object hold addresses.
  //
  // The old-gen memory allocator invokes register_object() for any
  // object that is allocated within old-gen memory.  This identifies
  // the starting addresses of objects that span boundaries between
  // card regions.
  //
  // It is not necessary to invoke register_object at the very instant
  // an object is allocated.  It is only necessary to invoke it
  // prior to the next start of a garbage collection concurrent mark
  // or concurrent update-references phase.  An "ideal" time to register
  // objects is during post-processing of a GCLAB after the GCLAB is
  // retired due to depletion of its memory.
  //
  // register_object() does not perform synchronization.  In the case
  // that multiple threads are registering objects whose starting
  // addresses are within the same cluster, races between these
  // threads may result in corruption of the object-start data
  // structures.  Parallel GC threads should avoid registering objects
  // residing within the same cluster by adhering to the following
  // coordination protocols:
  //
  //   1. Align thread-local GCLAB buffers with some TBD multiple of
  //      card clusters.  The card cluster size is 32 KB.  If the
  //      desired GCLAB size is 128 KB, align the buffer on a multiple
  //      of 4 card clusters.
  //   2. Post-process the contents of GCLAB buffers to register the
  //      objects allocated therein.  Allow one GC thread at a
  //      time to do the post-processing of each GCLAB.
  //   3. Since only one GC thread at a time is registering objects
  //      belonging to a particular allocation buffer, no locking
  //      is performed when registering these objects.
  //   4. Any remnant of unallocated memory within an expended GC
  //      allocation buffer is not returned to the old-gen allocation
  //      pool until after the GC allocation buffer has been post
  //      processed.  Before any remnant memory is returned to the
  //      old-gen allocation pool, the GC thread that scanned this GC
  //      allocation buffer performs a write-commit memory barrier.
  //   5. Background GC threads that perform tenuring of young-gen
  //      objects without a GCLAB use a CAS lock before registering
  //      each tenured object.  The CAS lock assures both mutual
  //      exclusion and memory coherency/visibility.  Note that an
  //      object tenured by a background GC thread will not overlap
  //      with any of the clusters that are receiving tenured objects
  //      by way of GCLAB buffers.  Multiple independent GC threads may
  //      attempt to tenure objects into a shared cluster.  This is why
  //      synchronization may be necessary.  Consider the following
  //      scenarios:
  //
  //      a) If two objects are tenured into the same card region, each
  //         registration may attempt to modify the first-start or
  //         last-start information associated with that card region.
  //         Furthermore, because the representations of first-start
  //         and last-start information within the object_starts array
  //         entry use different bits of a shared uint16_t to represent
  //         each, it is necessary to lock the entire card entry
  //         before modifying either the first-start or last-start
  //         information within the entry.
  //      b) Suppose GC thread X promotes a tenured object into
  //         card region A and this tenured object spans into
  //         neighboring card region B.  Suppose GC thread Y (not equal
  //         to X) promotes a tenured object into card region B.  GC thread X
  //         will update the object_starts information for card region A.  No
  //         synchronization with thread Y is required, because the two
  //         threads modify different object_starts entries.
  //      c) In summary, when background GC threads register objects
  //         newly tenured into old-gen memory, they must acquire a
  //         mutual exclusion lock on the card that holds the starting
  //         address of the newly tenured object.  This can be achieved
  //         by using a CAS instruction to assure that the previous
  //         values of first-offset and last-offset have not been
  //         changed since the same thread inquired as to their most
  //         current values.
  //
  //      One way to minimize the need for synchronization between
  //      background tenuring GC threads is for each tenuring GC thread
  //      to promote young-gen objects into distinct dedicated cluster
  //      ranges.
  //   6. The object_starts information is only required during the
  //      starting of concurrent marking and concurrent evacuation
  //      phases of GC.  Before we start either of these GC phases, the
  //      JVM enters a safe point and all GC threads perform
  //      commit-write barriers to assure that access to the
  //      object_starts information is coherent.
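  //
  // For illustration, the CAS lock in item 5 might take the following shape,
  // sketched here with std::atomic purely for exposition (HotSpot code would
  // use its own Atomic primitives, and the helper named below is hypothetical):
  //
  //   std::atomic<uint16_t>& entry = ...;   // the card's crossing_info word
  //   uint16_t old_val, new_val;
  //   do {
  //     old_val = entry.load(std::memory_order_acquire);
  //     new_val = recompute_first_and_last(old_val);   // update both packed bytes
  //   } while (!entry.compare_exchange_weak(old_val, new_val,
  //                                         std::memory_order_release,
  //                                         std::memory_order_acquire));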

  // Notes on synchronization of register_object():
  //
  //   1. For efficiency, there is no locking in the implementation of register_object().
  //   2. Thus, it is required that users of this service assure that concurrent/parallel invocations of
  //      register_object() do not pertain to the same card's memory range.  See the discussion below to understand
  //      the risks.
  //   3. When allocating from a TLAB or GCLAB, the mutual exclusion can be guaranteed by assuring that each
  //      LAB's start and end are aligned on card memory boundaries.
  //   4. Use the same lock that guarantees exclusivity when performing free-list allocation within heap regions.
  //
  // Register the newly allocated object while we're holding the global lock since there's no synchronization
  // built in to the implementation of register_object().  There are potential races when multiple independent
  // threads are allocating objects, some of which might span the same card region.  For example, consider
  // a card table's memory region within which three objects are being allocated by three different threads:
  //
  // objects being "concurrently" allocated:
  //    [-----a------][-----b-----][--------------c------------------]
  //            [---- card table memory range --------------]
  //
  // Before any objects are allocated, this card's memory range holds no objects.  Note that:
  //   allocation of object a wants to set the has-object, first-start, and last-start attributes of the preceding card region.
  //   allocation of object b wants to set the has-object, first-start, and last-start attributes of this card region.
  //   allocation of object c also wants to set the has-object, first-start, and last-start attributes of this card region.
  //
  // The thread allocating b and the thread allocating c can "race" in various ways, resulting in confusion, such as last-start
  // representing object b while first-start represents object c.  This is why we require that all register_object()
  // invocations associated with objects that are allocated from "free lists" provide their own mutual exclusion locking
  // mechanism.

  // Reset the starts_object() information to false for all cards in the range between from and to.
  void reset_object_range(HeapWord* from, HeapWord* to);

  // register_object() requires that the caller hold the heap lock
  // before calling it.
  void register_object(HeapWord* address);

  // register_object_without_lock() does not require that the caller hold
  // the heap lock before calling it, under the assumption that the
  // caller has assured no other thread will endeavor to concurrently
  // register objects that start within the same card's memory region
  // as address.
  void register_object_without_lock(HeapWord* address);

  // During the reference updates phase of GC, we walk through each old-gen memory region that was
  // not part of the collection set and we invalidate all unmarked objects.  As part of this effort,
  // we coalesce neighboring dead objects in order to make future remembered set scanning more
  // efficient (since future remembered set scanning of any card region containing consecutive
  // dead objects can skip over all of them at once by reading only a single dead object header
  // instead of having to read the header of each of the coalesced dead objects).
  //
  // At some future time, we may implement a further optimization: satisfy future allocation requests
  // by carving new objects out of the range of memory that represents the coalesced dead objects.
  //
  // Suppose we want to combine several dead objects into a single coalesced object.  How does this
  // impact our representation of crossing map information?
  //   1. If the newly coalesced range is contained entirely within a card range, that card's last
  //      start entry either remains the same or it is changed to the start of the coalesced region.
  //   2. For the card that holds the start of the coalesced object, it will not impact the first start
  //      but it may impact the last start.
  //   3. For following cards spanned entirely by the newly coalesced object, it will change starts_object
  //      to false (and make first-start and last-start "undefined").
  //   4. For a following card that is spanned partially by the newly coalesced object, it may change the
  //      first-start value, but it will not change the last-start value.
  //
  // The range of addresses represented by the arguments to coalesce_objects() must represent a range
  // of memory that was previously occupied exactly by one or more previously registered objects.  For
  // convenience, it is legal to invoke coalesce_objects() with arguments that span a single previously
  // registered object.
  //
  // The role of coalesce_objects is to change the crossing map information associated with all of the coalesced
  // objects.
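  //
  // A hypothetical example (offsets chosen for illustration): suppose cards C0, C1, and C2
  // each span 64 words, object X starts at word 10 of C0, dead object Y starts at word 40
  // of C0, and dead object Z starts at word 8 of C1 and ends at word 16 of C2.  Coalescing
  // Y and Z into a single filler object:
  //   - C0: first-start remains 10; last-start remains 40 (the filler starts there);
  //   - C1: no registered start remains, so starts_object(C1) becomes false;
  //   - C2: unchanged, since no object start within C2 was coalesced away.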
  void coalesce_objects(HeapWord* address, size_t length_in_words);

  // The typical use case is going to look something like this:
  //   for each heap region that comprises old-gen memory
  //     for each card number that corresponds to this heap region
  //       scan the objects contained therein if the card is dirty
  // To avoid excessive lookups in a sparse array, the API queries
  // the card number pertaining to a particular address and then uses the
  // card number for subsequent information lookups and stores.

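  // A sketch of that pattern against this class's services (illustrative only;
  // the real scanning loop lives in ShenandoahScanRemembered::process_clusters()):
  //
  //   size_t index = _rs->card_index_for_addr(region->bottom());
  //   for (each card spanned by the region, advancing index) {
  //     if (_rs->is_card_dirty(index)) {
  //       // start with the object covering the card's first word, which may
  //       // have been registered in an earlier card
  //       HeapWord* obj_addr = block_start(index);
  //       // ... scan objects from obj_addr through the end of the card's memory ...
  //     }
  //   }
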
  // If starts_object(card_index), this returns the word offset within this card
  // memory at which the first object begins.  If !starts_object(card_index), the
  // result is a don't care value -- asserts in a debug build.
  size_t get_first_start(size_t card_index) const;

  // If starts_object(card_index), this returns the word offset within this card
  // memory at which the last object begins.  If !starts_object(card_index), the
  // result is a don't care value.
  size_t get_last_start(size_t card_index) const;

  // Given a card_index, return the starting address of the first block in the heap
  // that straddles into the card.  If the card is co-initial with an object, then
  // this returns the first address of the range of heap memory that the card covers,
  // which is also that object's starting address.  Expects to be called for a card
  // affiliated with the old generation in generational mode.
  HeapWord* block_start(size_t card_index) const;
};

// ShenandoahScanRemembered is a concrete class representing the
// ability to scan the old-gen remembered set for references to
// objects residing in young-gen memory.
//
// Scanning normally begins with an invocation of numRegions and ends
// after all clusters of all regions have been scanned.
//
// Throughout the scanning effort, the number of regions does not
// change.
//
// Even though the regions that comprise old-gen memory are not
// necessarily contiguous, the abstraction represented by this class
// identifies each of the old-gen regions with an integer value
// in the range from 0 to (numRegions() - 1) inclusive.
//
class ShenandoahScanRemembered: public CHeapObj<mtGC> {

private:
  ShenandoahDirectCardMarkRememberedSet* _rs;
  ShenandoahCardCluster* _scc;

  // Global card stats (cumulative)
  HdrSeq _card_stats_scan_rs[MAX_CARD_STAT_TYPE];
  HdrSeq _card_stats_update_refs[MAX_CARD_STAT_TYPE];
  // Per worker card stats (multiplexed by phase)
  HdrSeq** _card_stats;

  // The types of card metrics that we gather
  const char* _card_stats_name[MAX_CARD_STAT_TYPE] = {
    "dirty_run", "clean_run",
    "dirty_cards", "clean_cards",
    "max_dirty_run", "max_clean_run",
    "dirty_scan_objs",
    "alternations"
  };

  // The statistics are collected and logged separately for
  // card-scans for initial marking, and for updating refs.
  const char* _card_stat_log_type[MAX_CARD_STAT_LOG_TYPE] = {
    "Scan Remembered Set", "Update Refs"
  };

  int _card_stats_log_counter[MAX_CARD_STAT_LOG_TYPE] = {0, 0};

public:
  ShenandoahScanRemembered(ShenandoahDirectCardMarkRememberedSet* rs) {
    _rs = rs;
    _scc = new ShenandoahCardCluster(rs);

    // We allocate ParallelGCThreads worth even though we usually only
    // use up to ConcGCThreads, because degenerate collections may employ
    // ParallelGCThreads for remembered set scanning.
    if (ShenandoahEnableCardStats) {
      _card_stats = NEW_C_HEAP_ARRAY(HdrSeq*, ParallelGCThreads, mtGC);
      for (uint i = 0; i < ParallelGCThreads; i++) {
        _card_stats[i] = new HdrSeq[MAX_CARD_STAT_TYPE];
      }
    } else {
      _card_stats = nullptr;
    }
  }

  ~ShenandoahScanRemembered() {
    delete _scc;
    if (ShenandoahEnableCardStats) {
      for (uint i = 0; i < ParallelGCThreads; i++) {
        // Array form of delete to match the array form of new above
        delete[] _card_stats[i];
      }
      FREE_C_HEAP_ARRAY(HdrSeq*, _card_stats);
      _card_stats = nullptr;
    }
    assert(_card_stats == nullptr, "Error");
  }

  HdrSeq* card_stats(uint worker_id) {
    assert(worker_id < ParallelGCThreads, "Error");
    assert(ShenandoahEnableCardStats == (_card_stats != nullptr), "Error");
    return ShenandoahEnableCardStats ? _card_stats[worker_id] : nullptr;
  }

  HdrSeq* card_stats_for_phase(CardStatLogType t) {
    switch (t) {
      case CARD_STAT_SCAN_RS:
        return _card_stats_scan_rs;
      case CARD_STAT_UPDATE_REFS:
        return _card_stats_update_refs;
      default:
        guarantee(false, "No such CardStatLogType");
    }
    return nullptr; // Quiet compiler
  }

  // Card index is zero-based relative to first spanned card region.
  size_t card_index_for_addr(HeapWord* p);
  HeapWord* addr_for_card_index(size_t card_index);
  bool is_card_dirty(size_t card_index);
  bool is_write_card_dirty(size_t card_index);
  bool is_card_dirty(HeapWord* p);
  bool is_write_card_dirty(HeapWord* p);
  void mark_card_as_dirty(HeapWord* p);
  void mark_range_as_dirty(HeapWord* p, size_t num_heap_words);
  void mark_range_as_clean(HeapWord* p, size_t num_heap_words);

  // This method is used to concurrently clean the "read" card table -
  // currently, as part of the reset phase.  Later on, the pointers to the "read"
  // and "write" card tables are swapped everywhere to enable the GC to
  // concurrently operate on the "read" table while mutators effect changes on
  // the "write" table.
  void mark_read_table_as_clean();

  // Swaps the read and write card table pointers, in effect installing a clean card
  // table for the next GC cycle.
  void swap_card_tables() { _rs->swap_card_tables(); }

  void merge_write_table(HeapWord* start, size_t word_count) { _rs->merge_write_table(start, word_count); }

  size_t cluster_for_addr(HeapWord* addr);
  HeapWord* addr_for_cluster(size_t cluster_no);

  void reset_object_range(HeapWord* from, HeapWord* to);
  void register_object(HeapWord* addr);
  void register_object_without_lock(HeapWord* addr);
  void coalesce_objects(HeapWord* addr, size_t length_in_words);

  HeapWord* first_object_in_card(size_t card_index) {
    if (_scc->starts_object(card_index)) {
      return addr_for_card_index(card_index) + _scc->get_first_start(card_index);
    } else {
      return nullptr;
    }
  }

  // Return true iff this object is "properly" registered.
  bool verify_registration(HeapWord* address, ShenandoahMarkingContext* ctx);

  // Set the cards to clean, and clear the object_starts info (to no objects).
  void mark_range_as_empty(HeapWord* addr, size_t length_in_words);

  // process_clusters() scans a portion of the remembered set
  // for references from old gen into young.  Several worker threads
  // scan different portions of the remembered set by making parallel invocations
  // of process_clusters() with each invocation scanning different
  // "clusters" of the remembered set.
  //
  // An invocation of process_clusters() examines all of the
  // intergenerational references spanned by `count` clusters starting
  // with `first_cluster`.  The `oops` argument is a worker-thread-local
  // OopClosure that is applied to all "valid" references in the remembered set.
  //
  // A side-effect of executing process_clusters() is to update the remembered
  // set entries (e.g. marking dirty cards clean if they no longer
  // hold references to young-gen memory).
  //
  // An implementation of process_clusters() may choose to efficiently
  // address more typical scenarios in the structure of remembered sets.  E.g.
  // in the generational setting, one might expect remembered sets to be very sparse
  // (low mutation rates in the old generation leading to sparse dirty cards,
  // each with very few intergenerational pointers).  Specific implementations
  // may choose to degrade gracefully as the sparsity assumption fails to hold,
  // such as when there are sudden spikes in (premature) promotion or in the
  // case of an underprovisioned, poorly-tuned, or poorly-shaped heap.
  //
  // At the start of a concurrent young generation marking cycle, we invoke process_clusters
  // with ClosureType ShenandoahInitMarkRootsClosure.
  //
  // At the start of a concurrent evacuation phase, we invoke process_clusters with
  // ClosureType ShenandoahEvacuateUpdateRootsClosure.

  template <typename ClosureType>
  void process_clusters(size_t first_cluster, size_t count, HeapWord* end_of_range, ClosureType* oops,
                        bool use_write_table, uint worker_id);

  template <typename ClosureType>
  void process_humongous_clusters(ShenandoahHeapRegion* r, size_t first_cluster, size_t count,
                                  HeapWord* end_of_range, ClosureType* oops, bool use_write_table);

  template <typename ClosureType>
  void process_region_slice(ShenandoahHeapRegion* region, size_t offset, size_t clusters, HeapWord* end_of_range,
                            ClosureType* cl, bool use_write_table, uint worker_id);
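
  // A sketch of how a scanning task might drive process_region_slice(), assuming
  // a ShenandoahRegionChunkIterator `work_list` hands out chunk assignments (see
  // below).  Illustrative only; the actual driver is ShenandoahScanRememberedTask:
  //
  //   ShenandoahRegionChunk assignment;
  //   while (work_list->next(&assignment)) {
  //     ShenandoahHeapRegion* region = assignment._r;
  //     HeapWord* end_of_range = region->bottom() + assignment._chunk_offset + assignment._chunk_size;
  //     size_t clusters = assignment._chunk_size /
  //                       (CardTable::card_size_in_words() * ShenandoahCardCluster::CardsPerCluster);
  //     process_region_slice(region, assignment._chunk_offset, clusters, end_of_range,
  //                          &cl, use_write_table, worker_id);
  //   }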

  // To Do:
  //   Create subclasses of ShenandoahInitMarkRootsClosure and
  //   ShenandoahEvacuateUpdateRootsClosure and any other closures
  //   that need to participate in remembered set scanning.  Within the
  //   subclasses, add a (probably templated) instance variable that
  //   refers to the associated ShenandoahCardCluster object.  Use this
  //   ShenandoahCardCluster instance to "enhance" the do_oops
  //   processing so that we can:
  //
  //   1. Avoid processing references that correspond to clean card
  //      regions, and
  //   2. Set card status to CLEAN when the associated card region no
  //      longer holds inter-generational references.
  //
  //   To enable efficient implementation of these behaviors, we
  //   probably also want to add a few fields into the
  //   ShenandoahCardCluster object that allow us to precompute and
  //   remember the addresses at which card status is going to change
  //   from dirty to clean and clean to dirty.  The do_oops
  //   implementations will want to update this value each time they
  //   cross one of these boundaries.
  void roots_do(OopIterateClosure* cl);

  // Log stats related to card/RS stats for given phase t
  void log_card_stats(uint nworkers, CardStatLogType t) PRODUCT_RETURN;

private:
  // Log stats for given worker id related into given summary card/RS stats
  void log_worker_card_stats(uint worker_id, HdrSeq* sum_stats) PRODUCT_RETURN;

  // Log given stats
  void log_card_stats(HdrSeq* stats) PRODUCT_RETURN;

  // Merge the stats from worker_id into the given summary stats, and clear the worker_id's stats.
  void merge_worker_card_stats_cumulative(HdrSeq* worker_stats, HdrSeq* sum_stats) PRODUCT_RETURN;
};

// A ShenandoahRegionChunk represents a contiguous interval of a ShenandoahHeapRegion, typically representing
// work to be done by a worker thread.
struct ShenandoahRegionChunk {
  ShenandoahHeapRegion* _r;   // The region of which this represents a chunk
  size_t _chunk_offset;       // Offset of the chunk within the region, in heap words
  size_t _chunk_size;         // Size of the chunk, in heap words
};

// ShenandoahRegionChunkIterator divides the total remembered set scanning effort into ShenandoahRegionChunks
// that are assigned one at a time to worker threads.  (Here, we use the terms `assignments` and `chunks`
// interchangeably.)  Note that the effort required to scan a range of memory is not necessarily a linear
// function of the size of the range.  Some memory ranges hold only a small number of live objects.
// Some ranges hold primarily primitive (non-pointer) data.  We start with larger chunk sizes because larger chunks
// reduce coordination overhead.  We expect that the GC worker threads that receive more difficult assignments
// will work longer on those chunks.  Meanwhile, other worker threads will repeatedly accept and complete multiple
// easier chunks.  As the total amount of work remaining to be completed decreases, we decrease the size of chunks
// given to individual threads.  This reduces the likelihood of significant imbalance between worker thread
// assignments near the end of the effort, when otherwise idle workers would wait for workers with difficult
// assignments to finish, and thereby reduces the overall duration of the phase.

class ShenandoahRegionChunkIterator : public StackObj {
private:
  // The largest chunk size is 4 MiB, measured in words.  Otherwise, remembered set scanning may become too unbalanced.
  // If the largest chunk size is too small, there is too much overhead sifting out assignments to individual worker threads.
  static const size_t _maximum_chunk_size_words = (4 * 1024 * 1024) / HeapWordSize;

  static const size_t _clusters_in_smallest_chunk = 4;

  size_t _largest_chunk_size_words;

  // smallest_chunk_size is 4 clusters.  With the default 512 B card size, each cluster spans 32 KiB,
  // so the smallest chunk spans 128 KiB.  This is computed from
  // CardTable::card_size_in_words() * ShenandoahCardCluster::CardsPerCluster.
  static size_t smallest_chunk_size_words() {
    return _clusters_in_smallest_chunk * CardTable::card_size_in_words() * ShenandoahCardCluster::CardsPerCluster;
  }

  // The total remembered set scanning effort is divided into chunks of work that are assigned to individual worker tasks.
  // The chunks of assigned work are divided into groups, where the size of the typical group (_regular_group_size) is half the
  // total number of regions.  The first group may be larger than _regular_group_size in the case that the first group's chunk
  // size is less than the region size.  The last group may be larger than _regular_group_size because no group is allowed to
  // have smaller assignments than _smallest_chunk_size, which is 128 KB.

  // Under normal circumstances, no configuration needs more than _maximum_groups (currently 11).
  // The first group "effectively" processes chunks of the largest size (4 MiB, or smaller for smaller region sizes).
  // The last group processes chunks of the smallest size supported by the configuration.

  // group[ 0] is 4 MiB chunk size (_maximum_chunk_size_words)
  // group[ 1] is 2 MiB chunk size
  // group[ 2] is 1 MiB chunk size
  // group[ 3] is 512 KiB chunk size
  // group[ 4] is 256 KiB chunk size
  // group[ 5] is 128 KiB chunk size
  // group[ 6] is 64 KiB chunk size
  // group[ 7] is 32 KiB chunk size
  // group[ 8] is 16 KiB chunk size
  // group[ 9] is 8 KiB chunk size
  // group[10] is 4 KiB chunk size
  // Note: 4 KiB is the smallest possible chunk_size, computed from:
  //   _clusters_in_smallest_chunk * MinimumCardSizeInWords * ShenandoahCardCluster::CardsPerCluster, which is
  //   4 * 16 * 64 = 4096
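
  // A sketch of how the group chunk sizes are derived by successive halving
  // (illustrative; the actual values are computed in calc_num_groups() and
  // related helpers):
  //
  //   size_t chunk_size_words = _maximum_chunk_size_words;
  //   size_t num_groups = 0;
  //   while (chunk_size_words >= smallest_chunk_size_words()) {
  //     // each successive group hands out chunks half the size of the previous group's
  //     group_chunk_size[num_groups++] = chunk_size_words;   // group_chunk_size: hypothetical local array
  //     chunk_size_words /= 2;
  //   }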

  // We set aside arrays to represent the maximum number of groups that may be required for any heap configuration
  static const size_t _maximum_groups = 11;

  const ShenandoahHeap* _heap;

  const size_t _regular_group_size;            // Number of chunks in each group
  const size_t _first_group_chunk_size_b4_rebalance;
  const size_t _num_groups;                    // Number of groups in this configuration
  size_t _adjusted_num_groups;                 // Rebalancing may coalesce groups
  const size_t _total_chunks;

  shenandoah_padding(0);
  volatile size_t _index;
  shenandoah_padding(1);

  size_t _region_index[_maximum_groups];       // The region index for the first region spanned by this group
  size_t _group_offset[_maximum_groups];       // The offset at which group begins within first region spanned by this group
  size_t _group_chunk_size[_maximum_groups];   // The size of each chunk within this group
  size_t _group_entries[_maximum_groups];      // Total chunks spanned by this group and the ones before it.

  // No implicit copying: iterators should be passed by reference to capture the state
  NONCOPYABLE(ShenandoahRegionChunkIterator);

  // Makes use of _heap.
  size_t calc_regular_group_size();

  // Makes use of _regular_group_size, which must be initialized before call.
  size_t calc_first_group_chunk_size_b4_rebalance();

  // Makes use of _regular_group_size and _first_group_chunk_size_b4_rebalance, both of which must be initialized before call.
  size_t calc_num_groups();

  // Makes use of _regular_group_size and _first_group_chunk_size_b4_rebalance, both of which must be initialized before call.
  size_t calc_total_chunks();

public:
  ShenandoahRegionChunkIterator(size_t worker_count);
  ShenandoahRegionChunkIterator(ShenandoahHeap* heap, size_t worker_count);

  // Reset iterator to default state
  void reset();

  // Fills in assignment with next chunk of work and returns true iff there is more work.
  // Otherwise, returns false.  This is multi-thread-safe.
  inline bool next(struct ShenandoahRegionChunk* assignment);

  // This is *not* MT safe.  However, in the absence of multithreaded access, it
  // can be used to determine if there is more work to do.
  inline bool has_next() const;
};
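
// A sketch of multi-threaded use of the iterator (illustrative): each worker
// repeatedly claims the next chunk until the iterator is exhausted.  next()
// may be invoked concurrently by many workers; has_next() may not.
//
//   void do_chunked_work(ShenandoahRegionChunkIterator& it, uint worker_id) {
//     ShenandoahRegionChunk chunk;
//     while (it.next(&chunk)) {
//       // scan chunk._chunk_size words of chunk._r, starting at chunk._chunk_offset
//     }
//   }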

class ShenandoahScanRememberedTask : public WorkerTask {
private:
  ShenandoahObjToScanQueueSet* _queue_set;
  ShenandoahObjToScanQueueSet* _old_queue_set;
  ShenandoahReferenceProcessor* _rp;
  ShenandoahRegionChunkIterator* _work_list;
  bool _is_concurrent;

public:
  ShenandoahScanRememberedTask(ShenandoahObjToScanQueueSet* queue_set,
                               ShenandoahObjToScanQueueSet* old_queue_set,
                               ShenandoahReferenceProcessor* rp,
                               ShenandoahRegionChunkIterator* work_list,
                               bool is_concurrent);

  void work(uint worker_id);
  void do_work(uint worker_id);
};
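
// A sketch of dispatching the task across a worker gang (illustrative; the
// actual invocation sites live in the generational Shenandoah collector code):
//
//   ShenandoahRegionChunkIterator work_list(heap->workers()->active_workers());
//   ShenandoahScanRememberedTask task(queue_set, old_queue_set, rp, &work_list, true /* concurrent */);
//   heap->workers()->run_task(&task);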

// After Full GC is done, reconstruct the remembered set by iterating over OLD regions,
// registering all objects between bottom() and top(), and dirtying the cards containing
// cross-generational pointers.
class ShenandoahReconstructRememberedSetTask : public WorkerTask {
private:
  ShenandoahRegionIterator* _regions;

public:
  explicit ShenandoahReconstructRememberedSetTask(ShenandoahRegionIterator* regions);

  void work(uint worker_id) override;
};

#endif // SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP