1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef SHARE_CLASSFILE_COMPACTHASHTABLE_HPP
26 #define SHARE_CLASSFILE_COMPACTHASHTABLE_HPP
27
28 #include "cds/aotCompressedPointers.hpp"
29 #include "cds/cds_globals.hpp"
30 #include "oops/array.hpp"
31 #include "oops/symbol.hpp"
32 #include "runtime/globals.hpp"
33 #include "utilities/growableArray.hpp"
34
35
36 template <
37 typename K,
38 typename V,
39 V (*DECODE)(address base_address, u4 encoded_value),
40 bool (*EQUALS)(V value, K key, int len)
41 >
42 class CompactHashtable;
43 class NumberSeq;
44 class SimpleCompactHashtable;
45 class SerializeClosure;
46
47 // Stats for symbol tables in the CDS archive
48 class CompactHashtableStats {
49 public:
50 int hashentry_count;
51 int hashentry_bytes;
52 int bucket_count;
53 int bucket_bytes;
54
55 CompactHashtableStats() :
56 hashentry_count(0), hashentry_bytes(0),
57 bucket_count(0), bucket_bytes(0) {}
58 };
59
60 #if INCLUDE_CDS
61 /////////////////////////////////////////////////////////////////////////
62 //
63 // The compact hash table writer. Used at dump time for writing out
64 // the compact table to the shared archive.
65 //
66 // At dump time, the CompactHashtableWriter obtains all entries from
67 // a table (the table could be in any form of a collection of <hash, encoded_value> pair)
68 // and adds them to a new temporary hash table (_buckets). The hash
69 // table size (number of buckets) is calculated using
70 // '(num_entries + bucket_size - 1) / bucket_size'. The default bucket
71 // size is 4 and can be changed by -XX:SharedSymbolTableBucketSize option.
72 // 4 is chosen because it produces smaller sized bucket on average for
73 // faster lookup. It also has relatively small number of empty buckets and
74 // good distribution of the entries.
75 //
76 // We use a simple hash function (hash % num_bucket) for the table.
77 // The new table is compacted when written out. Please see comments
78 // above the CompactHashtable class for the table layout detail. The bucket
79 // offsets are written to the archive as part of the compact table. The
80 // bucket offset is encoded in the low 30-bit (0-29) and the bucket type
81 // (regular or value_only) are encoded in bit[31, 30]. For buckets with more
82 // than one entry, both hash and encoded_value are written to the
83 // table. For buckets with only one entry, only the encoded_value is written
84 // to the table and the buckets are tagged as value_only in their type bits.
85 // Buckets without entry are skipped from the table. Their offsets are
86 // still written out for faster lookup.
87 //
88 class CompactHashtableWriter: public StackObj {
89 public:
90 class Entry {
91 unsigned int _hash;
92 u4 _encoded_value;
93
94 public:
95 Entry() {}
96 Entry(unsigned int hash, u4 encoded_value) : _hash(hash), _encoded_value(encoded_value) {}
97
98 u4 encoded_value() {
99 return _encoded_value;
100 }
101 unsigned int hash() {
102 return _hash;
103 }
104
105 bool operator==(const CompactHashtableWriter::Entry& other) {
106 return (_encoded_value == other._encoded_value && _hash == other._hash);
107 }
108 }; // class CompactHashtableWriter::Entry
109
110 private:
111 int _num_entries_written;
112 int _num_buckets;
113 int _num_empty_buckets;
114 int _num_value_only_buckets;
115 int _num_other_buckets;
116 GrowableArray<Entry>** _buckets;
117 CompactHashtableStats* _stats;
118 u4* _compact_buckets;
119 size_t _num_compact_buckets;
120 u4* _compact_entries;
121 size_t _num_compact_entries;
122
123 void compact_buckets_set(u4 index, u4 value);
124 void compact_entries_set(u4 index, u4 value);
125
126 public:
127 // This is called at dump-time only
128 CompactHashtableWriter(int num_entries, CompactHashtableStats* stats);
129 ~CompactHashtableWriter();
130
131 void add(unsigned int hash, u4 encoded_value);
132 void add(unsigned int hash, AOTCompressedPointers::narrowPtr encoded_value) {
133 add(hash, cast_to_u4(encoded_value));
134 }
135 void dump(SimpleCompactHashtable *cht, const char* table_name);
136
137 private:
138 void allocate_table();
139 void dump_table(NumberSeq* summary);
140 static int calculate_num_buckets(int num_entries) {
141 int num_buckets = num_entries / SharedSymbolTableBucketSize;
142 // calculation of num_buckets can result in zero buckets, we need at least one
143 return (num_buckets < 1) ? 1 : num_buckets;
144 }
145 };
146 #endif // INCLUDE_CDS
147
148 #define REGULAR_BUCKET_TYPE 0
149 #define VALUE_ONLY_BUCKET_TYPE 1
150 #define TABLEEND_BUCKET_TYPE 3
151 #define BUCKET_OFFSET_MASK 0x3FFFFFFF
152 #define BUCKET_OFFSET(info) ((info) & BUCKET_OFFSET_MASK)
153 #define BUCKET_TYPE_SHIFT 30
154 #define BUCKET_TYPE(info) (((info) & ~BUCKET_OFFSET_MASK) >> BUCKET_TYPE_SHIFT)
155 #define BUCKET_INFO(offset, type) (((type) << BUCKET_TYPE_SHIFT) | ((offset) & BUCKET_OFFSET_MASK))
156
157 /////////////////////////////////////////////////////////////////////////////
158 //
159 // CompactHashtable is used to store the CDS archive's tables.
160 // A table could be in any form of a collection of <hash, encoded_value> pair.
161 //
162 // Because these tables are read-only (no entries can be added/deleted) at run-time
163 // and tend to have large number of entries, we try to minimize the footprint
164 // cost per entry.
165 //
166 // The CompactHashtable is split into two arrays
167 //
168 // u4 buckets[num_buckets+1]; // bit[31,30]: type; bit[29-0]: offset
169 // u4 entries[<variable size>]
170 //
171 // The size of buckets[] is 'num_buckets + 1'. Each entry of
172 // buckets[] is a 32-bit encoding of the bucket type and bucket offset,
173 // with the type in the left-most 2-bit and offset in the remaining 30-bit.
174 //
175 // There are three types of buckets: regular, value_only, and table_end.
176 // . The regular buckets have '00' in their highest 2-bit.
177 // . The value_only buckets have '01' in their highest 2-bit.
178 // . There is only a single table_end bucket that marks the end of buckets[].
179 // It has '11' in its highest 2-bit.
180 //
181 // For regular buckets, each entry is 8 bytes in the entries[]:
182 // u4 hash; // entry hash
183 // u4 encoded_value; // A 32-bit encoding of the template type V. The template parameter DECODE
184 // // converts this to type V. Many CompactHashtables encode a pointer as a 32-bit offset, where
185 // // V entry = (V)(base_address + offset)
186 // // see StringTable, SymbolTable and AdapterHandlerLibrary for examples
187 //
188 // For value_only buckets, each entry has only the 4-byte 'encoded_value' in the entries[].
189 //
190 // The single table_end bucket has no corresponding entry.
191 //
192 // The number of entries in bucket <i> can be calculated like this:
193 // my_offset = _buckets[i] & 0x3fffffff; // mask off top 2-bit
194 // next_offset = _buckets[i+1] & 0x3fffffff
195 // For REGULAR_BUCKET_TYPE
196 // num_entries = (next_offset - my_offset) / 8;
197 // For VALUE_ONLY_BUCKET_TYPE
198 // num_entries = (next_offset - my_offset) / 4;
199 //
200 // If bucket <i> is empty, we have my_offset == next_offset. Empty buckets are
201 // always encoded as regular buckets.
202 //
203 // In the following example:
204 // - Bucket #0 is a REGULAR_BUCKET_TYPE with two entries
205 // - Bucket #1 is a VALUE_ONLY_BUCKET_TYPE with one entry.
206 // - Bucket #2 is a REGULAR_BUCKET_TYPE with zero entries.
207 //
208 // buckets[0, 4, 5(empty), 5, ...., N(table_end)]
209 // | | | | |
210 // | | +---+-----+ |
211 // | | | |
212 // | +----+ + |
213 // v v v v
214 // entries[H,O,H,O,O,H,O,H,O........]
215 //
216 // See CompactHashtable::lookup() for how the table is searched at runtime.
217 // See CompactHashtableWriter::dump() for how the table is written at CDS
218 // dump time.
219 //
220 class SimpleCompactHashtable {
221 protected:
222 address _base_address;
223 u4 _bucket_count;
224 u4 _entry_count;
225 u4* _buckets;
226 u4* _entries;
227
228 public:
229 SimpleCompactHashtable() :
230 _base_address(nullptr),
231 _bucket_count(0),
232 _entry_count(0),
233 _buckets(nullptr),
234 _entries(nullptr)
235 {}
236
237 void reset() {
238 _base_address = nullptr;
239 _bucket_count = 0;
240 _entry_count = 0;
241 _buckets = nullptr;
242 _entries = nullptr;
243 }
244
245 void init(address base_address, u4 entry_count, u4 bucket_count, u4* buckets, u4* entries);
246
247 // Read/Write the table's header from/to the CDS archive
248 void serialize_header(SerializeClosure* soc) NOT_CDS_RETURN;
249
250 inline bool empty() const {
251 return (_entry_count == 0);
252 }
253
254 inline size_t entry_count() const {
255 return _entry_count;
256 }
257 };
258
259 template <
260 typename K,
261 typename V,
262 V (*DECODE)(address base_address, u4 encoded_value),
263 bool (*EQUALS)(V value, K key, int len)
264 >
265 class CompactHashtable : public SimpleCompactHashtable {
266
267 V decode(u4 encoded_value) const {
268 return DECODE(_base_address, encoded_value);
269 }
270
271 public:
272 // Lookup a value V from the compact table using key K
273 inline V lookup(K key, unsigned int hash, int len) const {
274 if (_entry_count > 0) {
275 int index = hash % _bucket_count;
276 u4 bucket_info = _buckets[index];
277 u4 bucket_offset = BUCKET_OFFSET(bucket_info);
278 int bucket_type = BUCKET_TYPE(bucket_info);
279 u4* entry = _entries + bucket_offset;
280
281 if (bucket_type == VALUE_ONLY_BUCKET_TYPE) {
282 V value = decode(entry[0]);
283 if (EQUALS(value, key, len)) {
284 return value;
285 }
286 } else {
287 // This is a regular bucket, which has more than one
288 // entries. Each entry is a (hash, value) pair.
289 // Seek until the end of the bucket.
290 u4* entry_max = _entries + BUCKET_OFFSET(_buckets[index + 1]);
291 while (entry < entry_max) {
292 unsigned int h = (unsigned int)(entry[0]);
293 if (h == hash) {
294 V value = decode(entry[1]);
295 if (EQUALS(value, key, len)) {
296 return value;
297 }
298 }
299 entry += 2;
300 }
301 }
302 }
303 return nullptr;
304 }
305
306 // Iterate through the values in the table, stopping when do_value() returns false.
307 template <class ITER>
308 inline void iterate(ITER* iter) const { iterate([&](V v) { iter->do_value(v); }); }
309
310 template<typename Function>
311 inline void iterate(const Function& function) const { // lambda enabled API
312 iterate(const_cast<Function&>(function));
313 }
314
315 // Iterate through the values in the table, stopping when the lambda returns false.
316 template<typename Function>
317 inline void iterate(Function& function) const { // lambda enabled API
318 for (u4 i = 0; i < _bucket_count; i++) {
319 u4 bucket_info = _buckets[i];
320 u4 bucket_offset = BUCKET_OFFSET(bucket_info);
321 int bucket_type = BUCKET_TYPE(bucket_info);
322 u4* entry = _entries + bucket_offset;
323
324 if (bucket_type == VALUE_ONLY_BUCKET_TYPE) {
325 if (!function(decode(entry[0]))) {
326 return;
327 }
328 } else {
329 u4* entry_max = _entries + BUCKET_OFFSET(_buckets[i + 1]);
330 while (entry < entry_max) {
331 if (!function(decode(entry[1]))) {
332 return;
333 }
334 entry += 2;
335 }
336 }
337 }
338 }
339
340 // Unconditionally iterate through all the values in the table
341 template <class ITER>
342 inline void iterate_all(ITER* iter) const { iterate_all([&](V v) { iter->do_value(v); }); }
343
344 // Unconditionally iterate through all the values in the table using lambda
345 template<typename Function>
346 void iterate_all(Function function) const { // lambda enabled API
347 auto wrapper = [&] (V v) {
348 function(v);
349 return true;
350 };
351 iterate(wrapper);
352 }
353
354 void print_table_statistics(outputStream* st, const char* name) {
355 st->print_cr("%s statistics:", name);
356 int total_entries = 0;
357 int max_bucket = 0;
358 for (u4 i = 0; i < _bucket_count; i++) {
359 u4 bucket_info = _buckets[i];
360 int bucket_type = BUCKET_TYPE(bucket_info);
361 int bucket_size;
362
363 if (bucket_type == VALUE_ONLY_BUCKET_TYPE) {
364 bucket_size = 1;
365 } else {
366 bucket_size = (BUCKET_OFFSET(_buckets[i + 1]) - BUCKET_OFFSET(bucket_info)) / 2;
367 }
368 total_entries += bucket_size;
369 if (max_bucket < bucket_size) {
370 max_bucket = bucket_size;
371 }
372 }
373 st->print_cr("Number of buckets : %9d", _bucket_count);
374 st->print_cr("Number of entries : %9d", total_entries);
375 st->print_cr("Maximum bucket size : %9d", max_bucket);
376 }
377 };
378
379 ////////////////////////////////////////////////////////////////////////
380 //
381 // OffsetCompactHashtable -- This is used to store many types of objects
382 // in the CDS archive. On 64-bit platforms, we save space by using a 32-bit
383 // narrowPtr from the CDS base address.
384
385 template <typename V>
386 inline V read_value_from_compact_hashtable(address base_address, u4 narrowp) {
387 return AOTCompressedPointers::decode_not_null<V>(cast_from_u4(narrowp), base_address);
388 }
389
390 template <
391 typename K,
392 typename V,
393 bool (*EQUALS)(V value, K key, int len)
394 >
395 class OffsetCompactHashtable : public CompactHashtable<
396 K, V, read_value_from_compact_hashtable<V>, EQUALS> {
397 };
398
399
400 ////////////////////////////////////////////////////////////////////////
401 //
402 // Read/Write the contents of a hashtable textual dump (created by
403 // SymbolTable::dump and StringTable::dump).
404 // Because the dump file may be big (hundred of MB in extreme cases),
405 // we use mmap for fast access when reading it.
406 //
407 class HashtableTextDump {
408 int _fd;
409 const char* _base;
410 const char* _p;
411 const char* _end;
412 const char* _filename;
413 size_t _size;
414 int _prefix_type;
415 int _line_no;
416 public:
417 HashtableTextDump(const char* filename);
418 ~HashtableTextDump();
419
420 enum {
421 SymbolPrefix = 1 << 0,
422 StringPrefix = 1 << 1,
423 Unknown = 1 << 2
424 };
425
426 void quit(const char* err, const char* msg);
427
428 inline int remain() {
429 return (int)(_end - _p);
430 }
431 int last_line_no() {
432 return _line_no - 1;
433 }
434
435 void corrupted(const char *p, const char *msg);
436
437 inline void corrupted_if(bool cond, const char *msg) {
438 if (cond) {
439 corrupted(_p, msg);
440 }
441 }
442
443 bool skip_newline();
444 int skip(char must_be_char);
445 void skip_past(char c);
446 void check_version(const char* ver);
447
448 inline void get_num(char delim, int *num) {
449 const char* p = _p;
450 const char* end = _end;
451 u8 n = 0;
452
453 while (p < end) {
454 char c = *p++;
455 if ('0' <= c && c <= '9') {
456 n = n * 10 + (c - '0');
457 if (n > (u8)INT_MAX) {
458 corrupted(_p, "Num overflow");
459 }
460 } else if (c == delim) {
461 _p = p;
462 *num = (int)n;
463 return;
464 } else {
465 // Not [0-9], not 'delim'
466 corrupted(_p, "Unrecognized format");;
467 }
468 }
469
470 corrupted(_end, "Incorrect format");
471 ShouldNotReachHere();
472 }
473
474 void scan_prefix_type();
475 int scan_prefix(int* utf8_length);
476 int scan_string_prefix();
477 int scan_symbol_prefix();
478
479 int unescape(const char* from, const char* end, int count);
480 void get_utf8(char* utf8_buffer, int utf8_length);
481 static void put_utf8(outputStream* st, const char* utf8_string, size_t utf8_length);
482 };
483
484 #endif // SHARE_CLASSFILE_COMPACTHASHTABLE_HPP