1 /*
  2  * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #ifndef SHARE_CLASSFILE_COMPACTHASHTABLE_HPP
 26 #define SHARE_CLASSFILE_COMPACTHASHTABLE_HPP
 27 
 28 #include "cds/aotCompressedPointers.hpp"
 29 #include "cds/cds_globals.hpp"
 30 #include "oops/array.hpp"
 31 #include "oops/symbol.hpp"
 32 #include "runtime/globals.hpp"
 33 #include "utilities/growableArray.hpp"
 34 
 35 
 36 template <
 37   typename K,
 38   typename V,
 39   V (*DECODE)(address base_address, u4 encoded_value),
 40   bool (*EQUALS)(V value, K key, int len)
 41   >
 42 class CompactHashtable;
 43 class NumberSeq;
 44 class SimpleCompactHashtable;
 45 class SerializeClosure;
 46 
 47 // Stats for symbol tables in the CDS archive
 48 class CompactHashtableStats {
 49 public:
 50   int hashentry_count;
 51   int hashentry_bytes;
 52   int bucket_count;
 53   int bucket_bytes;
 54 
 55   CompactHashtableStats() :
 56     hashentry_count(0), hashentry_bytes(0),
 57     bucket_count(0), bucket_bytes(0) {}
 58 };
 59 
 60 #if INCLUDE_CDS
 61 /////////////////////////////////////////////////////////////////////////
 62 //
 63 // The compact hash table writer. Used at dump time for writing out
 64 // the compact table to the shared archive.
 65 //
 66 // At dump time, the CompactHashtableWriter obtains all entries from
 67 // a table (the table could be in any form of a collection of <hash, encoded_value> pair)
 68 // and adds them to a new temporary hash table (_buckets). The hash
 69 // table size (number of buckets) is calculated using
 70 // '(num_entries + bucket_size - 1) / bucket_size'. The default bucket
 71 // size is 4 and can be changed by -XX:SharedSymbolTableBucketSize option.
 72 // 4 is chosen because it produces smaller sized bucket on average for
 73 // faster lookup. It also has relatively small number of empty buckets and
 74 // good distribution of the entries.
 75 //
 76 // We use a simple hash function (hash % num_bucket) for the table.
 77 // The new table is compacted when written out. Please see comments
 78 // above the CompactHashtable class for the table layout detail. The bucket
 79 // offsets are written to the archive as part of the compact table. The
 80 // bucket offset is encoded in the low 30-bit (0-29) and the bucket type
 81 // (regular or value_only) are encoded in bit[31, 30]. For buckets with more
 82 // than one entry, both hash and encoded_value are written to the
 83 // table. For buckets with only one entry, only the encoded_value is written
 84 // to the table and the buckets are tagged as value_only in their type bits.
 85 // Buckets without entry are skipped from the table. Their offsets are
 86 // still written out for faster lookup.
 87 //
 88 class CompactHashtableWriter: public StackObj {
 89 public:
 90   class Entry {
 91     unsigned int _hash;
 92     u4 _encoded_value;
 93 
 94   public:
 95     Entry() {}
 96     Entry(unsigned int hash, u4 encoded_value) : _hash(hash), _encoded_value(encoded_value) {}
 97 
 98     u4 encoded_value() {
 99       return _encoded_value;
100     }
101     unsigned int hash() {
102       return _hash;
103     }
104 
105     bool operator==(const CompactHashtableWriter::Entry& other) {
106       return (_encoded_value == other._encoded_value && _hash == other._hash);
107     }
108   }; // class CompactHashtableWriter::Entry
109 
110 private:
111   int _num_entries_written;
112   int _num_buckets;
113   int _num_empty_buckets;
114   int _num_value_only_buckets;
115   int _num_other_buckets;
116   GrowableArray<Entry>** _buckets;
117   CompactHashtableStats* _stats;
118   u4* _compact_buckets;
119   size_t _num_compact_buckets;
120   u4* _compact_entries;
121   size_t _num_compact_entries;
122 
123   void compact_buckets_set(u4 index, u4 value);
124   void compact_entries_set(u4 index, u4 value);
125 
126 public:
127   // This is called at dump-time only
128   CompactHashtableWriter(int num_entries, CompactHashtableStats* stats);
129   ~CompactHashtableWriter();
130 
131   void add(unsigned int hash, u4 encoded_value);
132   void add(unsigned int hash, AOTCompressedPointers::narrowPtr encoded_value) {
133     add(hash, cast_to_u4(encoded_value));
134   }
135   void dump(SimpleCompactHashtable *cht, const char* table_name);
136 
137 private:
138   void allocate_table();
139   void dump_table(NumberSeq* summary);
140   static int calculate_num_buckets(int num_entries) {
141     int num_buckets = num_entries / SharedSymbolTableBucketSize;
142     // calculation of num_buckets can result in zero buckets, we need at least one
143     return (num_buckets < 1) ? 1 : num_buckets;
144   }
145 };
146 #endif // INCLUDE_CDS
147 
148 #define REGULAR_BUCKET_TYPE       0
149 #define VALUE_ONLY_BUCKET_TYPE    1
150 #define TABLEEND_BUCKET_TYPE      3
151 #define BUCKET_OFFSET_MASK        0x3FFFFFFF
152 #define BUCKET_OFFSET(info)       ((info) & BUCKET_OFFSET_MASK)
153 #define BUCKET_TYPE_SHIFT         30
154 #define BUCKET_TYPE(info)         (((info) & ~BUCKET_OFFSET_MASK) >> BUCKET_TYPE_SHIFT)
155 #define BUCKET_INFO(offset, type) (((type) << BUCKET_TYPE_SHIFT) | ((offset) & BUCKET_OFFSET_MASK))
156 
157 /////////////////////////////////////////////////////////////////////////////
158 //
159 // CompactHashtable is used to store the CDS archive's tables.
160 // A table could be in any form of a collection of <hash, encoded_value> pair.
161 //
162 // Because these tables are read-only (no entries can be added/deleted) at run-time
163 // and tend to have large number of entries, we try to minimize the footprint
164 // cost per entry.
165 //
166 // The CompactHashtable is split into two arrays
167 //
168 //   u4 buckets[num_buckets+1]; // bit[31,30]: type; bit[29-0]: offset
169 //   u4 entries[<variable size>]
170 //
171 // The size of buckets[] is 'num_buckets + 1'. Each entry of
172 // buckets[] is a 32-bit encoding of the bucket type and bucket offset,
173 // with the type in the left-most 2-bit and offset in the remaining 30-bit.
174 //
175 // There are three types of buckets: regular, value_only, and table_end.
176 //  . The regular buckets have '00' in their highest 2-bit.
177 //  . The value_only buckets have '01' in their highest 2-bit.
178 //  . There is only a single table_end bucket that marks the end of buckets[].
179 //    It has '11' in its highest 2-bit.
180 //
181 // For regular buckets, each entry is 8 bytes in the entries[]:
182 //   u4 hash;          // entry hash
183 //   u4 encoded_value; // A 32-bit encoding of the template type V. The template parameter DECODE
184 //                     // converts this to type V. Many CompactHashtables encode a pointer as a 32-bit offset, where
185 //                     //   V entry = (V)(base_address + offset)
186 //                     // see StringTable, SymbolTable and AdapterHandlerLibrary for examples
187 //
188 // For value_only buckets, each entry has only the 4-byte 'encoded_value' in the entries[].
189 //
190 // The single table_end bucket has no corresponding entry.
191 //
192 // The number of entries in bucket <i> can be calculated like this:
193 //      my_offset   = _buckets[i]   & 0x3fffffff; // mask off top 2-bit
194 //      next_offset = _buckets[i+1] & 0x3fffffff
195 //  For REGULAR_BUCKET_TYPE
196 //      num_entries = (next_offset - my_offset) / 8;
197 //  For VALUE_ONLY_BUCKET_TYPE
198 //      num_entries = (next_offset - my_offset) / 4;
199 //
200 // If bucket <i> is empty, we have my_offset == next_offset. Empty buckets are
201 // always encoded as regular buckets.
202 //
203 // In the following example:
204 //   - Bucket #0 is a REGULAR_BUCKET_TYPE with two entries
205 //   - Bucket #1 is a VALUE_ONLY_BUCKET_TYPE with one entry.
206 //   - Bucket #2 is a REGULAR_BUCKET_TYPE with zero entries.
207 //
208 // buckets[0, 4, 5(empty), 5, ...., N(table_end)]
209 //         |  |  |         |        |
210 //         |  |  +---+-----+        |
211 //         |  |      |              |
212 //         |  +----+ +              |
213 //         v       v v              v
214 // entries[H,O,H,O,O,H,O,H,O........]
215 //
216 // See CompactHashtable::lookup() for how the table is searched at runtime.
217 // See CompactHashtableWriter::dump() for how the table is written at CDS
218 // dump time.
219 //
220 class SimpleCompactHashtable {
221 protected:
222   address  _base_address;
223   u4  _bucket_count;
224   u4  _entry_count;
225   u4* _buckets;
226   u4* _entries;
227 
228 public:
229   SimpleCompactHashtable() :
230     _base_address(nullptr),
231     _bucket_count(0),
232     _entry_count(0),
233     _buckets(nullptr),
234     _entries(nullptr)
235   {}
236 
237   void reset() {
238     _base_address = nullptr;
239     _bucket_count = 0;
240     _entry_count = 0;
241     _buckets = nullptr;
242     _entries = nullptr;
243   }
244 
245   void init(address base_address, u4 entry_count, u4 bucket_count, u4* buckets, u4* entries);
246 
247   // Read/Write the table's header from/to the CDS archive
248   void serialize_header(SerializeClosure* soc) NOT_CDS_RETURN;
249 
250   inline bool empty() const {
251     return (_entry_count == 0);
252   }
253 
254   inline size_t entry_count() const {
255     return _entry_count;
256   }
257 };
258 
259 template <
260   typename K,
261   typename V,
262   V (*DECODE)(address base_address, u4 encoded_value),
263   bool (*EQUALS)(V value, K key, int len)
264   >
265 class CompactHashtable : public SimpleCompactHashtable {
266 
267   V decode(u4 encoded_value) const {
268     return DECODE(_base_address, encoded_value);
269   }
270 
271 public:
272   // Lookup a value V from the compact table using key K
273   inline V lookup(K key, unsigned int hash, int len) const {
274     if (_entry_count > 0) {
275       int index = hash % _bucket_count;
276       u4 bucket_info = _buckets[index];
277       u4 bucket_offset = BUCKET_OFFSET(bucket_info);
278       int bucket_type = BUCKET_TYPE(bucket_info);
279       u4* entry = _entries + bucket_offset;
280 
281       if (bucket_type == VALUE_ONLY_BUCKET_TYPE) {
282         V value = decode(entry[0]);
283         if (EQUALS(value, key, len)) {
284           return value;
285         }
286       } else {
287         // This is a regular bucket, which has more than one
288         // entries. Each entry is a (hash, value) pair.
289         // Seek until the end of the bucket.
290         u4* entry_max = _entries + BUCKET_OFFSET(_buckets[index + 1]);
291         while (entry < entry_max) {
292           unsigned int h = (unsigned int)(entry[0]);
293           if (h == hash) {
294             V value = decode(entry[1]);
295             if (EQUALS(value, key, len)) {
296               return value;
297             }
298           }
299           entry += 2;
300         }
301       }
302     }
303     return nullptr;
304   }
305 
306   // Iterate through the values in the table, stopping when do_value() returns false.
307   template <class ITER>
308   inline void iterate(ITER* iter) const { iterate([&](V v) { iter->do_value(v); }); }
309 
310   template<typename Function>
311   inline void iterate(const Function& function) const { // lambda enabled API
312     iterate(const_cast<Function&>(function));
313   }
314 
315   // Iterate through the values in the table, stopping when the lambda returns false.
316   template<typename Function>
317   inline void iterate(Function& function) const { // lambda enabled API
318     for (u4 i = 0; i < _bucket_count; i++) {
319       u4 bucket_info = _buckets[i];
320       u4 bucket_offset = BUCKET_OFFSET(bucket_info);
321       int bucket_type = BUCKET_TYPE(bucket_info);
322       u4* entry = _entries + bucket_offset;
323 
324       if (bucket_type == VALUE_ONLY_BUCKET_TYPE) {
325         if (!function(decode(entry[0]))) {
326           return;
327         }
328       } else {
329         u4* entry_max = _entries + BUCKET_OFFSET(_buckets[i + 1]);
330         while (entry < entry_max) {
331           if (!function(decode(entry[1]))) {
332             return;
333           }
334           entry += 2;
335         }
336       }
337     }
338   }
339 
340   // Unconditionally iterate through all the values in the table
341   template <class ITER>
342   inline void iterate_all(ITER* iter) const { iterate_all([&](V v) { iter->do_value(v); }); }
343 
344   // Unconditionally iterate through all the values in the table using lambda
345   template<typename Function>
346   void iterate_all(Function function) const { // lambda enabled API
347     auto wrapper = [&] (V v) {
348       function(v);
349       return true;
350     };
351     iterate(wrapper);
352   }
353 
354   void print_table_statistics(outputStream* st, const char* name) {
355     st->print_cr("%s statistics:", name);
356     int total_entries = 0;
357     int max_bucket = 0;
358     for (u4 i = 0; i < _bucket_count; i++) {
359       u4 bucket_info = _buckets[i];
360       int bucket_type = BUCKET_TYPE(bucket_info);
361       int bucket_size;
362 
363       if (bucket_type == VALUE_ONLY_BUCKET_TYPE) {
364         bucket_size = 1;
365       } else {
366         bucket_size = (BUCKET_OFFSET(_buckets[i + 1]) - BUCKET_OFFSET(bucket_info)) / 2;
367       }
368       total_entries += bucket_size;
369       if (max_bucket < bucket_size) {
370         max_bucket = bucket_size;
371       }
372     }
373     st->print_cr("Number of buckets       : %9d", _bucket_count);
374     st->print_cr("Number of entries       : %9d", total_entries);
375     st->print_cr("Maximum bucket size     : %9d", max_bucket);
376   }
377 };
378 
379 ////////////////////////////////////////////////////////////////////////
380 //
381 // OffsetCompactHashtable -- This is used to store many types of objects
382 // in the CDS archive. On 64-bit platforms, we save space by using a 32-bit
383 // narrowPtr from the CDS base address.
384 
385 template <typename V>
386 inline V read_value_from_compact_hashtable(address base_address, u4 narrowp) {
387   return AOTCompressedPointers::decode_not_null<V>(cast_from_u4(narrowp), base_address);
388 }
389 
390 template <
391   typename K,
392   typename V,
393   bool (*EQUALS)(V value, K key, int len)
394   >
395 class OffsetCompactHashtable : public CompactHashtable<
396     K, V, read_value_from_compact_hashtable<V>, EQUALS> {
397 };
398 
399 
400 ////////////////////////////////////////////////////////////////////////
401 //
402 // Read/Write the contents of a hashtable textual dump (created by
403 // SymbolTable::dump and StringTable::dump).
404 // Because the dump file may be big (hundred of MB in extreme cases),
405 // we use mmap for fast access when reading it.
406 //
407 class HashtableTextDump {
408   int _fd;
409   const char* _base;
410   const char* _p;
411   const char* _end;
412   const char* _filename;
413   size_t      _size;
414   int         _prefix_type;
415   int         _line_no;
416 public:
417   HashtableTextDump(const char* filename);
418   ~HashtableTextDump();
419 
420   enum {
421     SymbolPrefix = 1 << 0,
422     StringPrefix = 1 << 1,
423     Unknown = 1 << 2
424   };
425 
426   void quit(const char* err, const char* msg);
427 
428   inline int remain() {
429     return (int)(_end - _p);
430   }
431   int last_line_no() {
432     return _line_no - 1;
433   }
434 
435   void corrupted(const char *p, const char *msg);
436 
437   inline void corrupted_if(bool cond, const char *msg) {
438     if (cond) {
439       corrupted(_p, msg);
440     }
441   }
442 
443   bool skip_newline();
444   int skip(char must_be_char);
445   void skip_past(char c);
446   void check_version(const char* ver);
447 
448   inline void get_num(char delim, int *num) {
449     const char* p   = _p;
450     const char* end = _end;
451     u8 n = 0;
452 
453     while (p < end) {
454       char c = *p++;
455       if ('0' <= c && c <= '9') {
456         n = n * 10 + (c - '0');
457         if (n > (u8)INT_MAX) {
458           corrupted(_p, "Num overflow");
459         }
460       } else if (c == delim) {
461         _p = p;
462         *num = (int)n;
463         return;
464       } else {
465         // Not [0-9], not 'delim'
466         corrupted(_p, "Unrecognized format");;
467       }
468     }
469 
470     corrupted(_end, "Incorrect format");
471     ShouldNotReachHere();
472   }
473 
474   void scan_prefix_type();
475   int scan_prefix(int* utf8_length);
476   int scan_string_prefix();
477   int scan_symbol_prefix();
478 
479   int unescape(const char* from, const char* end, int count);
480   void get_utf8(char* utf8_buffer, int utf8_length);
481   static void put_utf8(outputStream* st, const char* utf8_string, size_t utf8_length);
482 };
483 
484 #endif // SHARE_CLASSFILE_COMPACTHASHTABLE_HPP