1 /*
  2  * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #ifndef SHARE_CLASSFILE_COMPACTHASHTABLE_HPP
 26 #define SHARE_CLASSFILE_COMPACTHASHTABLE_HPP
 27 
 28 #include "cds/aotCompressedPointers.hpp"
 29 #include "cds/cds_globals.hpp"
 30 #include "oops/array.hpp"
 31 #include "oops/symbol.hpp"
 32 #include "runtime/globals.hpp"
 33 #include "utilities/growableArray.hpp"
 34 
 35 
 36 template <
 37   typename K,
 38   typename V,
 39   V (*DECODE)(address base_address, u4 encoded_value),
 40   bool (*EQUALS)(V value, K key, int len)
 41   >
 42 class CompactHashtable;
 43 class NumberSeq;
 44 class SimpleCompactHashtable;
 45 class SerializeClosure;
 46 
 47 // Stats for symbol tables in the CDS archive
 48 class CompactHashtableStats {
 49 public:
 50   int hashentry_count;
 51   int hashentry_bytes;
 52   int bucket_count;
 53   int bucket_bytes;
 54 
 55   CompactHashtableStats() :
 56     hashentry_count(0), hashentry_bytes(0),
 57     bucket_count(0), bucket_bytes(0) {}
 58 };
 59 
 60 #if INCLUDE_CDS
 61 /////////////////////////////////////////////////////////////////////////
 62 //
 63 // The compact hash table writer. Used at dump time for writing out
 64 // the compact table to the shared archive.
 65 //
 66 // At dump time, the CompactHashtableWriter obtains all entries from
 67 // a table (the table could be in any form of a collection of <hash, encoded_value> pair)
 68 // and adds them to a new temporary hash table (_buckets). The hash
 69 // table size (number of buckets) is calculated using
 70 // '(num_entries + bucket_size - 1) / bucket_size'. The default bucket
 71 // size is 4 and can be changed by -XX:SharedSymbolTableBucketSize option.
 72 // 4 is chosen because it produces smaller sized bucket on average for
 73 // faster lookup. It also has relatively small number of empty buckets and
 74 // good distribution of the entries.
 75 //
 76 // We use a simple hash function (hash % num_bucket) for the table.
 77 // The new table is compacted when written out. Please see comments
 78 // above the CompactHashtable class for the table layout detail. The bucket
 79 // offsets are written to the archive as part of the compact table. The
 80 // bucket offset is encoded in the low 30-bit (0-29) and the bucket type
 81 // (regular or value_only) are encoded in bit[31, 30]. For buckets with more
 82 // than one entry, both hash and encoded_value are written to the
 83 // table. For buckets with only one entry, only the encoded_value is written
 84 // to the table and the buckets are tagged as value_only in their type bits.
 85 // Buckets without entry are skipped from the table. Their offsets are
 86 // still written out for faster lookup.
 87 //
 88 class CompactHashtableWriter: public StackObj {
 89 public:
 90   class Entry {
 91     unsigned int _hash;
 92     u4 _encoded_value;
 93 
 94   public:
 95     Entry() {}
 96     Entry(unsigned int hash, u4 encoded_value) : _hash(hash), _encoded_value(encoded_value) {}
 97 
 98     u4 encoded_value() {
 99       return _encoded_value;
100     }
101     unsigned int hash() {
102       return _hash;
103     }
104 
105     bool operator==(const CompactHashtableWriter::Entry& other) {
106       return (_encoded_value == other._encoded_value && _hash == other._hash);
107     }
108   }; // class CompactHashtableWriter::Entry
109 
110 private:
111   int _num_entries_written;
112   int _num_buckets;
113   int _num_empty_buckets;
114   int _num_value_only_buckets;
115   int _num_other_buckets;
116   GrowableArray<Entry>** _buckets;
117   CompactHashtableStats* _stats;
118   Array<u4>* _compact_buckets;
119   Array<u4>* _compact_entries;
120 
121 public:
122   // This is called at dump-time only
123   CompactHashtableWriter(int num_entries, CompactHashtableStats* stats);
124   ~CompactHashtableWriter();
125 
126   void add(unsigned int hash, u4 encoded_value);
127   void add(unsigned int hash, AOTCompressedPointers::narrowPtr encoded_value) {
128     add(hash, cast_to_u4(encoded_value));
129   }
130   void dump(SimpleCompactHashtable *cht, const char* table_name);
131 
132 private:
133   void allocate_table();
134   void dump_table(NumberSeq* summary);
135   static int calculate_num_buckets(int num_entries) {
136     int num_buckets = num_entries / SharedSymbolTableBucketSize;
137     // calculation of num_buckets can result in zero buckets, we need at least one
138     return (num_buckets < 1) ? 1 : num_buckets;
139   }
140 };
141 #endif // INCLUDE_CDS
142 
143 #define REGULAR_BUCKET_TYPE       0
144 #define VALUE_ONLY_BUCKET_TYPE    1
145 #define TABLEEND_BUCKET_TYPE      3
146 #define BUCKET_OFFSET_MASK        0x3FFFFFFF
147 #define BUCKET_OFFSET(info)       ((info) & BUCKET_OFFSET_MASK)
148 #define BUCKET_TYPE_SHIFT         30
149 #define BUCKET_TYPE(info)         (((info) & ~BUCKET_OFFSET_MASK) >> BUCKET_TYPE_SHIFT)
150 #define BUCKET_INFO(offset, type) (((type) << BUCKET_TYPE_SHIFT) | ((offset) & BUCKET_OFFSET_MASK))
151 
152 /////////////////////////////////////////////////////////////////////////////
153 //
154 // CompactHashtable is used to store the CDS archive's tables.
155 // A table could be in any form of a collection of <hash, encoded_value> pair.
156 //
157 // Because these tables are read-only (no entries can be added/deleted) at run-time
158 // and tend to have large number of entries, we try to minimize the footprint
159 // cost per entry.
160 //
161 // The CompactHashtable is split into two arrays
162 //
163 //   u4 buckets[num_buckets+1]; // bit[31,30]: type; bit[29-0]: offset
164 //   u4 entries[<variable size>]
165 //
166 // The size of buckets[] is 'num_buckets + 1'. Each entry of
167 // buckets[] is a 32-bit encoding of the bucket type and bucket offset,
168 // with the type in the left-most 2-bit and offset in the remaining 30-bit.
169 //
170 // There are three types of buckets: regular, value_only, and table_end.
171 //  . The regular buckets have '00' in their highest 2-bit.
172 //  . The value_only buckets have '01' in their highest 2-bit.
173 //  . There is only a single table_end bucket that marks the end of buckets[].
174 //    It has '11' in its highest 2-bit.
175 //
176 // For regular buckets, each entry is 8 bytes in the entries[]:
177 //   u4 hash;          // entry hash
178 //   u4 encoded_value; // A 32-bit encoding of the template type V. The template parameter DECODE
179 //                     // converts this to type V. Many CompactHashtables encode a pointer as a 32-bit offset, where
180 //                     //   V entry = (V)(base_address + offset)
181 //                     // see StringTable, SymbolTable and AdapterHandlerLibrary for examples
182 //
183 // For value_only buckets, each entry has only the 4-byte 'encoded_value' in the entries[].
184 //
185 // The single table_end bucket has no corresponding entry.
186 //
187 // The number of entries in bucket <i> can be calculated like this:
188 //      my_offset   = _buckets[i]   & 0x3fffffff; // mask off top 2-bit
189 //      next_offset = _buckets[i+1] & 0x3fffffff
190 //  For REGULAR_BUCKET_TYPE
191 //      num_entries = (next_offset - my_offset) / 8;
192 //  For VALUE_ONLY_BUCKET_TYPE
193 //      num_entries = (next_offset - my_offset) / 4;
194 //
195 // If bucket <i> is empty, we have my_offset == next_offset. Empty buckets are
196 // always encoded as regular buckets.
197 //
198 // In the following example:
199 //   - Bucket #0 is a REGULAR_BUCKET_TYPE with two entries
200 //   - Bucket #1 is a VALUE_ONLY_BUCKET_TYPE with one entry.
201 //   - Bucket #2 is a REGULAR_BUCKET_TYPE with zero entries.
202 //
203 // buckets[0, 4, 5(empty), 5, ...., N(table_end)]
204 //         |  |  |         |        |
205 //         |  |  +---+-----+        |
206 //         |  |      |              |
207 //         |  +----+ +              |
208 //         v       v v              v
209 // entries[H,O,H,O,O,H,O,H,O........]
210 //
211 // See CompactHashtable::lookup() for how the table is searched at runtime.
212 // See CompactHashtableWriter::dump() for how the table is written at CDS
213 // dump time.
214 //
215 class SimpleCompactHashtable {
216 protected:
217   address  _base_address;
218   u4  _bucket_count;
219   u4  _entry_count;
220   u4* _buckets;
221   u4* _entries;
222 
223 public:
224   SimpleCompactHashtable() :
225     _base_address(nullptr),
226     _bucket_count(0),
227     _entry_count(0),
228     _buckets(nullptr),
229     _entries(nullptr)
230   {}
231 
232   void reset() {
233     _base_address = nullptr;
234     _bucket_count = 0;
235     _entry_count = 0;
236     _buckets = nullptr;
237     _entries = nullptr;
238   }
239 
240   void init(address base_address, u4 entry_count, u4 bucket_count, u4* buckets, u4* entries);
241 
242   // Read/Write the table's header from/to the CDS archive
243   void serialize_header(SerializeClosure* soc) NOT_CDS_RETURN;
244 
245   inline bool empty() const {
246     return (_entry_count == 0);
247   }
248 
249   inline size_t entry_count() const {
250     return _entry_count;
251   }
252 };
253 
254 template <
255   typename K,
256   typename V,
257   V (*DECODE)(address base_address, u4 encoded_value),
258   bool (*EQUALS)(V value, K key, int len)
259   >
260 class CompactHashtable : public SimpleCompactHashtable {
261 
262   V decode(u4 encoded_value) const {
263     return DECODE(_base_address, encoded_value);
264   }
265 
266 public:
267   // Lookup a value V from the compact table using key K
268   inline V lookup(K key, unsigned int hash, int len) const {
269     if (_entry_count > 0) {
270       int index = hash % _bucket_count;
271       u4 bucket_info = _buckets[index];
272       u4 bucket_offset = BUCKET_OFFSET(bucket_info);
273       int bucket_type = BUCKET_TYPE(bucket_info);
274       u4* entry = _entries + bucket_offset;
275 
276       if (bucket_type == VALUE_ONLY_BUCKET_TYPE) {
277         V value = decode(entry[0]);
278         if (EQUALS(value, key, len)) {
279           return value;
280         }
281       } else {
282         // This is a regular bucket, which has more than one
283         // entries. Each entry is a (hash, value) pair.
284         // Seek until the end of the bucket.
285         u4* entry_max = _entries + BUCKET_OFFSET(_buckets[index + 1]);
286         while (entry < entry_max) {
287           unsigned int h = (unsigned int)(entry[0]);
288           if (h == hash) {
289             V value = decode(entry[1]);
290             if (EQUALS(value, key, len)) {
291               return value;
292             }
293           }
294           entry += 2;
295         }
296       }
297     }
298     return nullptr;
299   }
300 
301   // Iterate through the values in the table, stopping when do_value() returns false.
302   template <class ITER>
303   inline void iterate(ITER* iter) const { iterate([&](V v) { iter->do_value(v); }); }
304 
305   template<typename Function>
306   inline void iterate(const Function& function) const { // lambda enabled API
307     iterate(const_cast<Function&>(function));
308   }
309 
310   // Iterate through the values in the table, stopping when the lambda returns false.
311   template<typename Function>
312   inline void iterate(Function& function) const { // lambda enabled API
313     for (u4 i = 0; i < _bucket_count; i++) {
314       u4 bucket_info = _buckets[i];
315       u4 bucket_offset = BUCKET_OFFSET(bucket_info);
316       int bucket_type = BUCKET_TYPE(bucket_info);
317       u4* entry = _entries + bucket_offset;
318 
319       if (bucket_type == VALUE_ONLY_BUCKET_TYPE) {
320         if (!function(decode(entry[0]))) {
321           return;
322         }
323       } else {
324         u4* entry_max = _entries + BUCKET_OFFSET(_buckets[i + 1]);
325         while (entry < entry_max) {
326           if (!function(decode(entry[1]))) {
327             return;
328           }
329           entry += 2;
330         }
331       }
332     }
333   }
334 
335   // Unconditionally iterate through all the values in the table
336   template <class ITER>
337   inline void iterate_all(ITER* iter) const { iterate_all([&](V v) { iter->do_value(v); }); }
338 
339   // Unconditionally iterate through all the values in the table using lambda
340   template<typename Function>
341   void iterate_all(Function function) const { // lambda enabled API
342     auto wrapper = [&] (V v) {
343       function(v);
344       return true;
345     };
346     iterate(wrapper);
347   }
348 
349   void print_table_statistics(outputStream* st, const char* name) {
350     st->print_cr("%s statistics:", name);
351     int total_entries = 0;
352     int max_bucket = 0;
353     for (u4 i = 0; i < _bucket_count; i++) {
354       u4 bucket_info = _buckets[i];
355       int bucket_type = BUCKET_TYPE(bucket_info);
356       int bucket_size;
357 
358       if (bucket_type == VALUE_ONLY_BUCKET_TYPE) {
359         bucket_size = 1;
360       } else {
361         bucket_size = (BUCKET_OFFSET(_buckets[i + 1]) - BUCKET_OFFSET(bucket_info)) / 2;
362       }
363       total_entries += bucket_size;
364       if (max_bucket < bucket_size) {
365         max_bucket = bucket_size;
366       }
367     }
368     st->print_cr("Number of buckets       : %9d", _bucket_count);
369     st->print_cr("Number of entries       : %9d", total_entries);
370     st->print_cr("Maximum bucket size     : %9d", max_bucket);
371   }
372 };
373 
374 ////////////////////////////////////////////////////////////////////////
375 //
376 // OffsetCompactHashtable -- This is used to store many types of objects
377 // in the CDS archive. On 64-bit platforms, we save space by using a 32-bit
378 // narrowPtr from the CDS base address.
379 
380 template <typename V>
381 inline V read_value_from_compact_hashtable(address base_address, u4 narrowp) {
382   return AOTCompressedPointers::decode_not_null<V>(cast_from_u4(narrowp), base_address);
383 }
384 
385 template <
386   typename K,
387   typename V,
388   bool (*EQUALS)(V value, K key, int len)
389   >
390 class OffsetCompactHashtable : public CompactHashtable<
391     K, V, read_value_from_compact_hashtable<V>, EQUALS> {
392 };
393 
394 
395 ////////////////////////////////////////////////////////////////////////
396 //
397 // Read/Write the contents of a hashtable textual dump (created by
398 // SymbolTable::dump and StringTable::dump).
399 // Because the dump file may be big (hundred of MB in extreme cases),
400 // we use mmap for fast access when reading it.
401 //
402 class HashtableTextDump {
403   int _fd;
404   const char* _base;
405   const char* _p;
406   const char* _end;
407   const char* _filename;
408   size_t      _size;
409   int         _prefix_type;
410   int         _line_no;
411 public:
412   HashtableTextDump(const char* filename);
413   ~HashtableTextDump();
414 
415   enum {
416     SymbolPrefix = 1 << 0,
417     StringPrefix = 1 << 1,
418     Unknown = 1 << 2
419   };
420 
421   void quit(const char* err, const char* msg);
422 
423   inline int remain() {
424     return (int)(_end - _p);
425   }
426   int last_line_no() {
427     return _line_no - 1;
428   }
429 
430   void corrupted(const char *p, const char *msg);
431 
432   inline void corrupted_if(bool cond, const char *msg) {
433     if (cond) {
434       corrupted(_p, msg);
435     }
436   }
437 
438   bool skip_newline();
439   int skip(char must_be_char);
440   void skip_past(char c);
441   void check_version(const char* ver);
442 
443   inline void get_num(char delim, int *num) {
444     const char* p   = _p;
445     const char* end = _end;
446     u8 n = 0;
447 
448     while (p < end) {
449       char c = *p++;
450       if ('0' <= c && c <= '9') {
451         n = n * 10 + (c - '0');
452         if (n > (u8)INT_MAX) {
453           corrupted(_p, "Num overflow");
454         }
455       } else if (c == delim) {
456         _p = p;
457         *num = (int)n;
458         return;
459       } else {
460         // Not [0-9], not 'delim'
461         corrupted(_p, "Unrecognized format");;
462       }
463     }
464 
465     corrupted(_end, "Incorrect format");
466     ShouldNotReachHere();
467   }
468 
469   void scan_prefix_type();
470   int scan_prefix(int* utf8_length);
471   int scan_string_prefix();
472   int scan_symbol_prefix();
473 
474   int unescape(const char* from, const char* end, int count);
475   void get_utf8(char* utf8_buffer, int utf8_length);
476   static void put_utf8(outputStream* st, const char* utf8_string, size_t utf8_length);
477 };
478 
479 #endif // SHARE_CLASSFILE_COMPACTHASHTABLE_HPP