1 /*
  2  * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #ifndef SHARE_CLASSFILE_COMPACTHASHTABLE_HPP
 26 #define SHARE_CLASSFILE_COMPACTHASHTABLE_HPP
 27 
 28 #include "cds/cds_globals.hpp"
 29 #include "oops/array.hpp"
 30 #include "oops/symbol.hpp"
 31 #include "runtime/globals.hpp"
 32 #include "utilities/growableArray.hpp"
 33 
 34 
 35 template <
 36   typename K,
 37   typename V,
 38   V (*DECODE)(address base_address, u4 offset),
 39   bool (*EQUALS)(V value, K key, int len)
 40   >
 41 class CompactHashtable;
 42 class NumberSeq;
 43 class SimpleCompactHashtable;
 44 class SerializeClosure;
 45 
 46 // Stats for symbol tables in the CDS archive
 47 class CompactHashtableStats {
 48 public:
 49   int hashentry_count;
 50   int hashentry_bytes;
 51   int bucket_count;
 52   int bucket_bytes;
 53 
 54   CompactHashtableStats() :
 55     hashentry_count(0), hashentry_bytes(0),
 56     bucket_count(0), bucket_bytes(0) {}
 57 };
 58 
 59 #if INCLUDE_CDS
 60 /////////////////////////////////////////////////////////////////////////
 61 //
 62 // The compact hash table writer. Used at dump time for writing out
 63 // the compact table to the shared archive.
 64 //
// At dump time, the CompactHashtableWriter obtains all entries from the
// symbol/string table and adds them to a new temporary hash table. The hash
// table size (number of buckets) is calculated using
// 'num_entries / bucket_size' (integer division), with a minimum of one
// bucket; see calculate_num_buckets(). The default bucket
// size is 4 and can be changed by -XX:SharedSymbolTableBucketSize option.
// 4 is chosen because it produces smaller sized bucket on average for
// faster lookup. It also has relatively small number of empty buckets and
// good distribution of the entries.
 73 //
// We use a simple hash function (hash % num_bucket) for the table.
// The new table is compacted when written out. Please see comments
// above the SimpleCompactHashtable class for the table layout detail. The
// bucket offsets are written to the archive as part of the compact table.
// The bucket offset is encoded in the low 30 bits (0-29) and the bucket type
// (regular or value-only) is encoded in bit[31, 30]. For buckets with more
// than one entry, both hash and entry offset are written to the
// table. For buckets with only one entry, only the entry offset is written
// to the table and the buckets are tagged as value-only in their type bits.
// Buckets without entries are skipped from the table. Their offsets are
// still written out for faster lookup.
 85 //
 86 class CompactHashtableWriter: public StackObj {
 87 public:
 88   class Entry {
 89     unsigned int _hash;
 90     u4 _value;
 91 
 92   public:
 93     Entry() {}
 94     Entry(unsigned int hash, u4 val) : _hash(hash), _value(val) {}
 95 
 96     u4 value() {
 97       return _value;
 98     }
 99     unsigned int hash() {
100       return _hash;
101     }
102 
103     bool operator==(const CompactHashtableWriter::Entry& other) {
104       return (_value == other._value && _hash == other._hash);
105     }
106   }; // class CompactHashtableWriter::Entry
107 
108 private:
109   int _num_entries_written;
110   int _num_buckets;
111   int _num_empty_buckets;
112   int _num_value_only_buckets;
113   int _num_other_buckets;
114   GrowableArray<Entry>** _buckets;
115   CompactHashtableStats* _stats;
116   Array<u4>* _compact_buckets;
117   Array<u4>* _compact_entries;
118 
119 public:
120   // This is called at dump-time only
121   CompactHashtableWriter(int num_entries, CompactHashtableStats* stats);
122   ~CompactHashtableWriter();
123 
124   void add(unsigned int hash, u4 value);
125 
126 private:
127   void allocate_table();
128   void dump_table(NumberSeq* summary);
129   static int calculate_num_buckets(int num_entries) {
130     int num_buckets = num_entries / SharedSymbolTableBucketSize;
131     // calculation of num_buckets can result in zero buckets, we need at least one
132     return (num_buckets < 1) ? 1 : num_buckets;
133   }
134 
135 public:
136   void dump(SimpleCompactHashtable *cht, const char* table_name);
137 
138   static size_t estimate_size(int num_entries);
139 };
140 #endif // INCLUDE_CDS
141 
// Layout of one 32-bit buckets[] word: the bucket type lives in
// bit[31,30] and the offset in bit[29-0].
#define BUCKET_TYPE_SHIFT         30
#define BUCKET_OFFSET_MASK        0x3FFFFFFF

// Values stored in the two type bits.
#define REGULAR_BUCKET_TYPE       0
#define VALUE_ONLY_BUCKET_TYPE    1
#define TABLEEND_BUCKET_TYPE      3

// Extract the offset / the type from a bucket word, or build a bucket word
// from an offset and a type.
#define BUCKET_OFFSET(info)       ((info) & BUCKET_OFFSET_MASK)
#define BUCKET_TYPE(info)         (((info) & ~BUCKET_OFFSET_MASK) >> BUCKET_TYPE_SHIFT)
#define BUCKET_INFO(offset, type) (((type) << BUCKET_TYPE_SHIFT) | ((offset) & BUCKET_OFFSET_MASK))
150 
151 /////////////////////////////////////////////////////////////////////////////
152 //
153 // CompactHashtable is used to store the CDS archive's symbol/string tables.
154 //
155 // Because these tables are read-only (no entries can be added/deleted) at run-time
156 // and tend to have large number of entries, we try to minimize the footprint
157 // cost per entry.
158 //
159 // The CompactHashtable is split into two arrays
160 //
161 //   u4 buckets[num_buckets+1]; // bit[31,30]: type; bit[29-0]: offset
162 //   u4 entries[<variable size>]
163 //
// The size of buckets[] is 'num_buckets + 1'. Each entry of
// buckets[] is a 32-bit encoding of the bucket type and bucket offset,
// with the type in the top 2 bits and the offset in the remaining 30 bits.
// The last entry has a special type. It contains the end of the last
// bucket.
169 //
170 // There are two types of buckets, regular buckets and value_only buckets. The
171 // value_only buckets have '01' in their highest 2-bit, and regular buckets have
172 // '00' in their highest 2-bit.
173 //
// For regular buckets, each entry is 8 bytes in the entries[]:
175 //   u4 hash;    /* symbol/string hash */
176 //   union {
177 //     u4 offset;  /* Symbol* sym = (Symbol*)(base_address + offset) */
178 //     narrowOop str; /* String narrowOop encoding */
179 //   }
180 //
181 //
182 // For value_only buckets, each entry has only the 4-byte 'offset' in the entries[].
183 //
184 // Example -- note that the second bucket is a VALUE_ONLY_BUCKET_TYPE so the hash code
185 //            is skipped.
186 // buckets[0, 4, 5, ....]
187 //         |  |  |
188 //         |  |  +---+
189 //         |  |      |
190 //         |  +----+ |
191 //         v       v v
192 // entries[H,O,H,O,O,H,O,H,O.....]
193 //
194 // See CompactHashtable::lookup() for how the table is searched at runtime.
195 // See CompactHashtableWriter::dump() for how the table is written at CDS
196 // dump time.
197 //
198 class SimpleCompactHashtable {
199 protected:
200   address  _base_address;
201   u4  _bucket_count;
202   u4  _entry_count;
203   u4* _buckets;
204   u4* _entries;
205 
206 public:
207   SimpleCompactHashtable() :
208     _base_address(nullptr),
209     _bucket_count(0),
210     _entry_count(0),
211     _buckets(nullptr),
212     _entries(nullptr)
213   {}
214 
215   void reset() {
216     _base_address = nullptr;
217     _bucket_count = 0;
218     _entry_count = 0;
219     _buckets = nullptr;
220     _entries = nullptr;
221   }
222 
223   void init(address base_address, u4 entry_count, u4 bucket_count, u4* buckets, u4* entries);
224 
225   // Read/Write the table's header from/to the CDS archive
226   void serialize_header(SerializeClosure* soc) NOT_CDS_RETURN;
227 
228   inline bool empty() const {
229     return (_entry_count == 0);
230   }
231 
232   inline size_t entry_count() const {
233     return _entry_count;
234   }
235 
236   static size_t calculate_header_size();
237 };
238 
239 template <
240   typename K,
241   typename V,
242   V (*DECODE)(address base_address, u4 offset),
243   bool (*EQUALS)(V value, K key, int len)
244   >
245 class CompactHashtable : public SimpleCompactHashtable {
246   friend class VMStructs;
247 
248   V decode(u4 offset) const {
249     return DECODE(_base_address, offset);
250   }
251 
252 public:
253   // Lookup a value V from the compact table using key K
254   inline V lookup(K key, unsigned int hash, int len) const {
255     if (_entry_count > 0) {
256       int index = hash % _bucket_count;
257       u4 bucket_info = _buckets[index];
258       u4 bucket_offset = BUCKET_OFFSET(bucket_info);
259       int bucket_type = BUCKET_TYPE(bucket_info);
260       u4* entry = _entries + bucket_offset;
261 
262       if (bucket_type == VALUE_ONLY_BUCKET_TYPE) {
263         V value = decode(entry[0]);
264         if (EQUALS(value, key, len)) {
265           return value;
266         }
267       } else {
268         // This is a regular bucket, which has more than one
269         // entries. Each entry is a pair of entry (hash, offset).
270         // Seek until the end of the bucket.
271         u4* entry_max = _entries + BUCKET_OFFSET(_buckets[index + 1]);
272         while (entry < entry_max) {
273           unsigned int h = (unsigned int)(entry[0]);
274           if (h == hash) {
275             V value = decode(entry[1]);
276             if (EQUALS(value, key, len)) {
277               return value;
278             }
279           }
280           entry += 2;
281         }
282       }
283     }
284     return nullptr;
285   }
286 
287   template <class ITER>
288   inline void iterate(ITER* iter) const {
289     for (u4 i = 0; i < _bucket_count; i++) {
290       u4 bucket_info = _buckets[i];
291       u4 bucket_offset = BUCKET_OFFSET(bucket_info);
292       int bucket_type = BUCKET_TYPE(bucket_info);
293       u4* entry = _entries + bucket_offset;
294 
295       if (bucket_type == VALUE_ONLY_BUCKET_TYPE) {
296         iter->do_value(decode(entry[0]));
297       } else {
298         u4*entry_max = _entries + BUCKET_OFFSET(_buckets[i + 1]);
299         while (entry < entry_max) {
300           iter->do_value(decode(entry[1]));
301           entry += 2;
302         }
303       }
304     }
305   }
306 
307   template<typename Function>
308   inline void iterate(Function function) const { // lambda enabled API
309     for (u4 i = 0; i < _bucket_count; i++) {
310       u4 bucket_info = _buckets[i];
311       u4 bucket_offset = BUCKET_OFFSET(bucket_info);
312       int bucket_type = BUCKET_TYPE(bucket_info);
313       u4* entry = _entries + bucket_offset;
314 
315       if (bucket_type == VALUE_ONLY_BUCKET_TYPE) {
316         function(decode(entry[0]));
317       } else {
318         u4* entry_max = _entries + BUCKET_OFFSET(_buckets[i + 1]);
319         while (entry < entry_max) {
320           function(decode(entry[1]));
321           entry += 2;
322         }
323       }
324     }
325   }
326 
327   void print_table_statistics(outputStream* st, const char* name) {
328     st->print_cr("%s statistics:", name);
329     int total_entries = 0;
330     int max_bucket = 0;
331     for (u4 i = 0; i < _bucket_count; i++) {
332       u4 bucket_info = _buckets[i];
333       int bucket_type = BUCKET_TYPE(bucket_info);
334       int bucket_size;
335 
336       if (bucket_type == VALUE_ONLY_BUCKET_TYPE) {
337         bucket_size = 1;
338       } else {
339         bucket_size = (BUCKET_OFFSET(_buckets[i + 1]) - BUCKET_OFFSET(bucket_info)) / 2;
340       }
341       total_entries += bucket_size;
342       if (max_bucket < bucket_size) {
343         max_bucket = bucket_size;
344       }
345     }
346     st->print_cr("Number of buckets       : %9d", _bucket_count);
347     st->print_cr("Number of entries       : %9d", total_entries);
348     st->print_cr("Maximum bucket size     : %9d", max_bucket);
349   }
350 };
351 
352 ////////////////////////////////////////////////////////////////////////
353 //
354 // OffsetCompactHashtable -- This is used to store many types of objects
355 // in the CDS archive. On 64-bit platforms, we save space by using a 32-bit
356 // offset from the CDS base address.
357 
358 template <typename V>
359 inline V read_value_from_compact_hashtable(address base_address, u4 offset) {
360   return (V)(base_address + offset);
361 }
362 
363 template <
364   typename K,
365   typename V,
366   bool (*EQUALS)(V value, K key, int len)
367   >
368 class OffsetCompactHashtable : public CompactHashtable<
369     K, V, read_value_from_compact_hashtable<V>, EQUALS> {
370 };
371 
372 
373 ////////////////////////////////////////////////////////////////////////
374 //
375 // Read/Write the contents of a hashtable textual dump (created by
376 // SymbolTable::dump and StringTable::dump).
// Because the dump file may be big (hundreds of MB in extreme cases),
// we use mmap for fast access when reading it.
379 //
380 class HashtableTextDump {
381   int _fd;
382   const char* _base;
383   const char* _p;
384   const char* _end;
385   const char* _filename;
386   size_t      _size;
387   int         _prefix_type;
388   int         _line_no;
389 public:
390   HashtableTextDump(const char* filename);
391   ~HashtableTextDump();
392 
393   enum {
394     SymbolPrefix = 1 << 0,
395     StringPrefix = 1 << 1,
396     Unknown = 1 << 2
397   };
398 
399   void quit(const char* err, const char* msg);
400 
401   inline int remain() {
402     return (int)(_end - _p);
403   }
404   int last_line_no() {
405     return _line_no - 1;
406   }
407 
408   void corrupted(const char *p, const char *msg);
409 
410   inline void corrupted_if(bool cond, const char *msg) {
411     if (cond) {
412       corrupted(_p, msg);
413     }
414   }
415 
416   bool skip_newline();
417   int skip(char must_be_char);
418   void skip_past(char c);
419   void check_version(const char* ver);
420 
421   inline void get_num(char delim, int *num) {
422     const char* p   = _p;
423     const char* end = _end;
424     u8 n = 0;
425 
426     while (p < end) {
427       char c = *p++;
428       if ('0' <= c && c <= '9') {
429         n = n * 10 + (c - '0');
430         if (n > (u8)INT_MAX) {
431           corrupted(_p, "Num overflow");
432         }
433       } else if (c == delim) {
434         _p = p;
435         *num = (int)n;
436         return;
437       } else {
438         // Not [0-9], not 'delim'
439         corrupted(_p, "Unrecognized format");;
440       }
441     }
442 
443     corrupted(_end, "Incorrect format");
444     ShouldNotReachHere();
445   }
446 
447   void scan_prefix_type();
448   int scan_prefix(int* utf8_length);
449   int scan_string_prefix();
450   int scan_symbol_prefix();
451 
452   int unescape(const char* from, const char* end, int count);
453   void get_utf8(char* utf8_buffer, int utf8_length);
454   static void put_utf8(outputStream* st, const char* utf8_string, int utf8_length);
455 };
456 
457 #endif // SHARE_CLASSFILE_COMPACTHASHTABLE_HPP