1 /*
  2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #ifndef CPU_X86_STUBGENERATOR_X86_64_HPP
 26 #define CPU_X86_STUBGENERATOR_X86_64_HPP
 27 
 28 #include "code/codeBlob.hpp"
 29 #include "runtime/continuation.hpp"
 30 #include "runtime/stubCodeGenerator.hpp"
 31 #include "runtime/stubRoutines.hpp"
 32 
 33 // Stub Code definitions
 34 
// Platform (x86_64) stub generator. Each generate_* member emits a stub's
// machine code via the inherited StubCodeGenerator machinery and returns the
// entry address of the generated code; void generate_*_stubs() members emit
// whole families of stubs.
class StubGenerator: public StubCodeGenerator {
 private:

  // Call stubs are used to call Java from C.
  address generate_call_stub(address& return_address);

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // rax: exception oop

  address generate_catch_exception();

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!

  address generate_forward_exception();

  // Support for intptr_t OrderAccess::fence()
  address generate_orderaccess_fence();

  //----------------------------------------------------------------------------------------------------
  // Support for void verify_mxcsr()
  //
  // This routine is used with -Xcheck:jni to verify that native
  // JNI code does not return to Java code without restoring the
  // MXCSR register to our expected state.

  address generate_verify_mxcsr();

  // Fix-up stubs for float/double -> int/long conversion corner cases.
  address generate_f2i_fixup();
  address generate_f2l_fixup();
  address generate_d2i_fixup();
  address generate_d2l_fixup();

  // Lookup tables and shuffle/permute constants used by vector intrinsics.
  address generate_count_leading_zeros_lut();
  address generate_popcount_avx_lut();
  address generate_iota_indices();
  address generate_vector_reverse_bit_lut();

  address generate_vector_reverse_byte_perm_mask_long();
  address generate_vector_reverse_byte_perm_mask_int();
  address generate_vector_reverse_byte_perm_mask_short();
  address generate_vector_byte_shuffle_mask();

  address generate_fp_mask(StubId stub_id, int64_t mask);

  address generate_compress_perm_table(StubId stub_id);

  address generate_expand_perm_table(StubId stub_id);

  address generate_vector_mask(StubId stub_id, int64_t mask);

  address generate_vector_byte_perm_mask();

  address generate_vector_fp_mask(StubId stub_id, int64_t mask);

  // Emits a custom 32-bit-element vector constant; unspecified trailing
  // elements default to 0.
  address generate_vector_custom_i32(StubId stub_id, Assembler::AvxVectorLen len,
                                     int32_t val0, int32_t val1, int32_t val2, int32_t val3,
                                     int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
                                     int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
                                     int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0);

  // Non-destructive plausibility checks for oops
  address generate_verify_oop();

  // Verify that a register contains clean 32-bits positive value
  // (high 32-bits are 0) so it could be used in 64-bits shifts.
  void assert_clean_int(Register Rint, Register Rtmp);

  //  Generate overlap test for array copy stubs
  void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf);

  // Convenience overload: branch to a generated no-overlap entry address.
  void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
    assert(no_overlap_target != nullptr, "must be generated");
    array_overlap_test(no_overlap_target, nullptr, sf);
  }
  // Convenience overload: branch to a local label on no overlap.
  void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
    array_overlap_test(nullptr, &L_no_overlap, sf);
  }


  // Shuffle first three arg regs on Windows into Linux/Solaris locations.
  void setup_arg_regs(int nargs = 3);
  void restore_arg_regs();

#ifdef ASSERT
  // Debug-only flag tracking whether arg regs were saved via the
  // *_using_thread variants below (checked on restore).
  bool _regs_in_thread;
#endif

  // This is used in places where r10 is a scratch register, and can
  // be adapted if r9 is needed also.
  void setup_arg_regs_using_thread(int nargs = 3);

  void restore_arg_regs_using_thread();

  // Copy big chunks forward
  void copy_bytes_forward(Register end_from, Register end_to,
                          Register qword_count, Register tmp1,
                          Register tmp2, Label& L_copy_bytes,
                          Label& L_copy_8_bytes, DecoratorSet decorators,
                          BasicType type);

  // Copy big chunks backward
  void copy_bytes_backward(Register from, Register dest,
                           Register qword_count, Register tmp1,
                           Register tmp2, Label& L_copy_bytes,
                           Label& L_copy_8_bytes, DecoratorSet decorators,
                           BasicType type);

  void setup_argument_regs(BasicType type);

  void restore_argument_regs(BasicType type);

#if COMPILER2_OR_JVMCI
  // Following rules apply to AVX3 optimized arraycopy stubs:
  // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
  //   for both special cases (various small block sizes) and aligned copy loop. This is the
  //   default configuration.
  // - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs)
  //   for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it.
  // - If user forces MaxVectorSize=32 then above 4096 bytes it's seen that REP MOVs shows a
  //   better performance for disjoint copies. For conjoint/backward copy vector based
  //   copy performs better.
  // - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over
  //   64 byte vector registers (ZMMs).

  address generate_disjoint_copy_avx3_masked(StubId stub_id, address* entry);

  address generate_conjoint_copy_avx3_masked(StubId stub_id, address* entry,
                                             address nooverlap_target);

  void arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                    Register to, Register count, int shift,
                                    Register index, Register temp,
                                    bool use64byteVector, Label& L_entry, Label& L_exit);

  void arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
                                    Register to, Register count, int shift,
                                    Register index, Register temp, Label& L_exit);

  void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                             Register to, Register start_index, Register end_index,
                                             Register count, int shift, Register temp,
                                             bool use64byteVector, Label& L_entry, Label& L_exit);

  void arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
                            Register temp3, Register temp4, Register count,
                            XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                            XMMRegister xmm4, int shift);

  void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                  int shift = Address::times_1, int offset = 0);

  void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                  bool conjoint, int shift = Address::times_1, int offset = 0,
                  bool use64byteVector = false);

  void copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1, XMMRegister xmm2,
                                XMMRegister xmm3, XMMRegister xmm4, int shift, int offset = 0);

  void copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                         KRegister mask, Register length, Register index,
                         Register temp, int shift = Address::times_1, int offset = 0,
                         bool use64byteVector = false);

  void copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
                         KRegister mask, Register length, Register index,
                         Register temp, int shift = Address::times_1, int offset = 0);
#endif // COMPILER2_OR_JVMCI

  address generate_disjoint_byte_copy(address* entry);

  address generate_conjoint_byte_copy(address nooverlap_target, address* entry);

  address generate_disjoint_short_copy(address *entry);

  address generate_fill(StubId stub_id);

  address generate_conjoint_short_copy(address nooverlap_target, address *entry);
  address generate_disjoint_int_oop_copy(StubId stub_id, address* entry);
  address generate_conjoint_int_oop_copy(StubId stub_id, address nooverlap_target,
                                         address *entry);
  address generate_disjoint_long_oop_copy(StubId stub_id, address* entry);
  address generate_conjoint_long_oop_copy(StubId stub_id, address nooverlap_target,
                                          address *entry);

  // Helper for generating a dynamic type check.
  // Smashes no registers.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success);

  // Generate checkcasting array copy stub
  address generate_checkcast_copy(StubId stub_id, address *entry);

  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  address generate_unsafe_copy(address byte_copy_entry, address short_copy_entry,
                               address int_copy_entry, address long_copy_entry);

  // Generate 'unsafe' set memory stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Examines the alignment of the operands and dispatches
  // to an int, short, or byte copy loop.
  address generate_unsafe_setmemory(address byte_copy_entry);

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed);

  // Generate generic array copy stubs
  address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry);

  address generate_data_cache_writeback();

  address generate_data_cache_writeback_sync();

  void generate_arraycopy_stubs();


  // MD5 stubs

  // ofs and limit are used for multi-block byte array.
  // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
  address generate_md5_implCompress(StubId stub_id);


  // SHA stubs

  // ofs and limit are used for multi-block byte array.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
  address generate_sha1_implCompress(StubId stub_id);

  // ofs and limit are used for multi-block byte array.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
  address generate_sha256_implCompress(StubId stub_id);
  address generate_sha512_implCompress(StubId stub_id);

  // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
  address generate_pshuffle_byte_flip_mask_sha512();

  address generate_upper_word_mask();
  address generate_shuffle_byte_flip_mask();
  address generate_pshuffle_byte_flip_mask();


  // AES intrinsic stubs

  address generate_aescrypt_encryptBlock();

  address generate_aescrypt_decryptBlock();

  address generate_cipherBlockChaining_encryptAESCrypt();

  // A version of CBC/AES Decrypt which does 4 blocks in a loop at a time
  // to hide instruction latency
  address generate_cipherBlockChaining_decryptAESCrypt_Parallel();

  address generate_electronicCodeBook_encryptAESCrypt();

  void aesecb_encrypt(Register source_addr, Register dest_addr, Register key, Register len);

  address generate_electronicCodeBook_decryptAESCrypt();

  void aesecb_decrypt(Register source_addr, Register dest_addr, Register key, Register len);

  // Shared implementation for ECB/AES Encrypt and Decrypt, which does 4 blocks
  // in a loop at a time to hide instruction latency. Set is_encrypt=true for
  // encryption, false for decryption.
  address generate_electronicCodeBook_AESCrypt_Parallel(bool is_encrypt);

  // A version of ECB/AES Encrypt which does 4 blocks in a loop at a time
  // to hide instruction latency
  address generate_electronicCodeBook_encryptAESCrypt_Parallel();

  // A version of ECB/AES Decrypt which does 4 blocks in a loop at a time
  // to hide instruction latency
  address generate_electronicCodeBook_decryptAESCrypt_Parallel();

  // Vector AES Galois Counter Mode implementation
  address generate_galoisCounterMode_AESCrypt();
  void aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key,
                      Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter);

  // AVX2 AES Galois Counter Mode implementation
  address generate_avx2_galoisCounterMode_AESCrypt();
  void aesgcm_avx2(Register in, Register len, Register ct, Register out, Register key,
                   Register state, Register subkeyHtbl, Register counter);

  // Vector AES Counter implementation
  address generate_counterMode_VectorAESCrypt();
  void aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
                      Register len_reg, Register used, Register used_addr, Register saved_encCounter_start);

  // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
  // to hide instruction latency
  address generate_counterMode_AESCrypt_Parallel();

  address generate_cipherBlockChaining_decryptVectorAESCrypt();

  address generate_key_shuffle_mask();

  // AES round helpers (single-round encrypt/decrypt steps) and
  // AVX-512 GHASH/GCM building blocks.
  void roundDec(XMMRegister xmm_reg);
  void roundDeclast(XMMRegister xmm_reg);
  void roundEnc(XMMRegister key, int rnum);
  void lastroundEnc(XMMRegister key, int rnum);
  void roundDec(XMMRegister key, int rnum);
  void lastroundDec(XMMRegister key, int rnum);
  void gfmul_avx512(XMMRegister ghash, XMMRegister hkey);
  void ghash16_encrypt_parallel16_avx512(Register in, Register out, Register ct, Register pos, Register avx512_subkeyHtbl,
                                         Register CTR_CHECK, Register NROUNDS, Register key, XMMRegister CTR, XMMRegister GHASH,
                                         XMMRegister ADDBE_4x4, XMMRegister ADDBE_1234, XMMRegister ADD_1234, XMMRegister SHUF_MASK,
                                         bool hk_broadcast, bool is_hash_start, bool do_hash_reduction, bool do_hash_hxor,
                                         bool no_ghash_in, int ghashin_offset, int aesout_offset, int hashkey_offset);
  void generateHtbl_32_blocks_avx512(Register htbl, Register avx512_htbl);
  void initial_blocks_16_avx512(Register in, Register out, Register ct, Register pos, Register key, Register avx512_subkeyHtbl,
                                Register CTR_CHECK, Register rounds, XMMRegister CTR, XMMRegister GHASH,  XMMRegister ADDBE_4x4,
                                XMMRegister ADDBE_1234, XMMRegister ADD_1234, XMMRegister SHUF_MASK, int stack_offset);
  void gcm_enc_dec_last_avx512(Register len, Register in, Register pos, XMMRegister HASH, XMMRegister SHUFM, Register subkeyHtbl,
                               int ghashin_offset, int hashkey_offset, bool start_ghash, bool do_reduction);
  void ghash16_avx512(bool start_ghash, bool do_reduction, bool uload_shuffle, bool hk_broadcast, bool do_hxor,
                      Register in, Register pos, Register subkeyHtbl, XMMRegister HASH, XMMRegister SHUFM, int in_offset,
                      int in_disp, int displacement, int hashkey_offset);
  void aesgcm_avx512(Register in, Register len, Register ct, Register out, Register key,
                     Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter);
  // AVX2 AES-GCM related functions
  void initial_blocks_avx2(XMMRegister ctr, Register rounds, Register key, Register len,
                           Register in, Register out, Register ct, XMMRegister aad_hashx, Register pos);
  void gfmul_avx2(XMMRegister GH, XMMRegister HK);
  void generateHtbl_8_block_avx2(Register htbl);
  void ghash8_encrypt8_parallel_avx2(Register key, Register subkeyHtbl, XMMRegister ctr_blockx, Register in,
                                     Register out, Register ct, Register pos, bool out_order, Register rounds,
                                     XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
                                     XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, XMMRegister xmm8);
  void ghash_last_8_avx2(Register subkeyHtbl);

  void check_key_offset(Register key, int offset, int load_size);

  // Load key and shuffle operation
  void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask);
  void ev_load_key(XMMRegister xmmdst, Register key, int offset, Register rscratch);

  // Utility routine for loading a 128-bit key word in little endian format
  // can optionally specify that the shuffle mask is already in an xmmregister
  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask);
  void load_key(XMMRegister xmmdst, Register key, int offset, Register rscratch);

  // Utility routine for increase 128bit counter (iv in CTR mode)
  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block);
  void ev_add128(XMMRegister xmmdst, XMMRegister xmmsrc1, XMMRegister xmmsrc2,
                 int vector_len, KRegister ktmp, XMMRegister ones);
  void generate_aes_stubs();


  // GHASH stubs

  void generate_ghash_stubs();

  void schoolbookAAD(int i, Register subkeyH, XMMRegister data, XMMRegister tmp0,
                     XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3);
  void gfmul(XMMRegister tmp0, XMMRegister t);
  void generateHtbl_one_block(Register htbl, Register rscratch);
  void generateHtbl_eight_blocks(Register htbl);
  void avx_ghash(Register state, Register htbl, Register data, Register blocks);

  // Used by GHASH and AES stubs.
  address ghash_polynomial_addr();
  address ghash_shufflemask_addr();
  address ghash_long_swap_mask_addr(); // byte swap x86 long
  address ghash_byte_swap_mask_addr(); // byte swap x86 byte array

  // Single and multi-block ghash operations
  address generate_ghash_processBlocks();

  // Ghash single and multi block operations using AVX instructions
  address generate_avx_ghash_processBlocks();

  // ChaCha20 stubs and helper functions
  void generate_chacha_stubs();
  address generate_chacha20Block_avx();
  address generate_chacha20Block_avx512();
  void cc20_quarter_round_avx(XMMRegister aVec, XMMRegister bVec,
    XMMRegister cVec, XMMRegister dVec, XMMRegister scratch,
    XMMRegister lrot8, XMMRegister lrot16, int vector_len);
  void cc20_shift_lane_org(XMMRegister bVec, XMMRegister cVec,
    XMMRegister dVec, int vector_len, bool colToDiag);
  void cc20_keystream_collate_avx512(XMMRegister aVec, XMMRegister bVec,
    XMMRegister cVec, XMMRegister dVec, Register baseAddr, int baseOffset);

  // Poly1305 multiblock using IFMA instructions
  address generate_poly1305_processBlocks();
  void poly1305_process_blocks_avx512(const Register input, const Register length,
                                      const Register A0, const Register A1, const Register A2,
                                      const Register R0, const Register R1, const Register C1);
  void poly1305_multiply_scalar(const Register a0, const Register a1, const Register a2,
                                const Register r0, const Register r1, const Register c1, bool only128,
                                const Register t0, const Register t1, const Register t2,
                                const Register mulql, const Register mulqh);
  void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
                                 const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P,
                                 const XMMRegister P0L, const XMMRegister P0H, const XMMRegister P1L, const XMMRegister P1H, const XMMRegister P2L, const XMMRegister P2H,
                                 const XMMRegister TMP, const Register rscratch);
  void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, const Register t0, const Register t1);
  void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs, const Register t0, const Register t1);
  void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1,
                             const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG,
                             const XMMRegister TMP, const Register rscratch);
  // Poly1305 AVX2 implementation
  void poly1305_process_blocks_avx2(const Register input, const Register length,
    const Register a0, const Register a1, const Register a2,
    const Register r0, const Register r1, const Register c1);
  void poly1305_msg_mul_reduce_vec4_avx2(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
                                   const Address R0, const Address R1, const Address R2,
                                   const Address R1P, const Address R2P,
                                   const XMMRegister P0L, const XMMRegister P0H,
                                   const XMMRegister P1L, const XMMRegister P1H,
                                   const XMMRegister P2L, const XMMRegister P2H,
                                   const XMMRegister YTMP1, const XMMRegister YTMP2,
                                   const XMMRegister YTMP3, const XMMRegister YTMP4,
                                   const XMMRegister YTMP5, const XMMRegister YTMP6,
                                   const Register input, const Register length, const Register rscratch);
  void poly1305_mul_reduce_vec4_avx2(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
                               const XMMRegister R0, const XMMRegister R1, const XMMRegister R2,
                               const XMMRegister R1P, const XMMRegister R2P,
                               const XMMRegister P0L, const XMMRegister P0H,
                               const XMMRegister P1L, const XMMRegister P1H,
                               const XMMRegister P2L, const XMMRegister P2H,
                               const XMMRegister YTMP1, const Register rscratch);

  // Integer-polynomial arithmetic stubs (P-256 Montgomery multiplication).
  address generate_intpoly_montgomeryMult_P256();
  address generate_intpoly_assign();

  // SHA3 stubs
  void generate_sha3_stubs();

  // Kyber stubs
  void generate_kyber_stubs();

  // Dilithium stubs
  void generate_dilithium_stubs();

  // BASE64 stubs
  address base64_shuffle_addr();
  address base64_avx2_shuffle_addr();
  address base64_avx2_input_mask_addr();
  address base64_avx2_lut_addr();
  address base64_encoding_table_addr();

  // Code for generating Base64 encoding.
  // Intrinsic function prototype in Base64.java:
  // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
  address generate_base64_encodeBlock();

  // base64 AVX512vbmi tables
  address base64_vbmi_lookup_lo_addr();
  address base64_vbmi_lookup_hi_addr();
  address base64_vbmi_lookup_lo_url_addr();
  address base64_vbmi_lookup_hi_url_addr();
  address base64_vbmi_pack_vec_addr();
  address base64_vbmi_join_0_1_addr();
  address base64_vbmi_join_1_2_addr();
  address base64_vbmi_join_2_3_addr();
  address base64_decoding_table_addr();
  address base64_AVX2_decode_tables_addr();
  address base64_AVX2_decode_LUT_tables_addr();

  // Code for generating Base64 decoding.
  //
  // Based on the article (and associated code) from https://arxiv.org/abs/1910.05109.
  //
  // Intrinsic function prototype in Base64.java:
  // private void decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, isMIME);
  address generate_base64_decodeBlock();

  // Checksum stubs (CRC32, CRC32C, Adler32).
  address generate_updateBytesCRC32();
  address generate_updateBytesCRC32C(bool is_pclmulqdq_supported);

  address generate_updateBytesAdler32();

  // BigInteger and array-mismatch intrinsic stubs.
  address generate_multiplyToLen();

  address generate_vectorizedMismatch();

  address generate_squareToLen();

  address generate_method_entry_barrier();

  address generate_mulAdd();

  address generate_bigIntegerRightShift();
  address generate_bigIntegerLeftShift();

  // Float16 <-> Float conversion stubs.
  address generate_float16ToFloat();
  address generate_floatToFloat16();

  // Libm math stubs (trigonometric, hyperbolic, exponential, logarithmic, ...)

  address generate_libmSin();
  address generate_libmCos();
  address generate_libmTan();
  address generate_libmSinh();
  address generate_libmTanh();
  address generate_libmCbrt();
  address generate_libmExp();
  address generate_libmPow();
  address generate_libmLog();
  address generate_libmLog10();
  address generate_libmFmod();

  // Shared constants
  // NOTE(review): declared next to the libm stubs above — presumably constant
  // tables shared by their implementations; confirm against the .cpp.
  static address ZERO;
  static address NEG_ZERO;
  static address ONE;
  static address ONEHALF;
  static address SIGN_MASK;
  static address TWO_POW_55;
  static address TWO_POW_M55;
  static address SHIFTER;
  static address PI32INV;
  static address PI_INV_TABLE;
  static address Ctable;
  static address SC_1;
  static address SC_2;
  static address SC_3;
  static address SC_4;
  static address PI_4;
  static address P_1;
  static address P_3;
  static address P_2;

  void generate_libm_stubs();

#ifdef COMPILER2
  void generate_string_indexof(address *fnptrs);
#endif

  // Continuation (Loom) support stubs.
  address generate_cont_thaw(StubId stub_id);
  address generate_cont_thaw();

  // TODO: will probably need multiple return barriers depending on return type
  address generate_cont_returnBarrier();
  address generate_cont_returnBarrier_exception();

  address generate_cont_preempt_stub();

  // TODO -- delete this as it is not implemented?
  //
  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.
  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg);

  // shared exception handler for FFM upcall stubs
  address generate_upcall_stub_exception_handler();
  address generate_upcall_stub_load_target();

  // interpreter or compiled code marshalling registers to/from inline type instance
  address generate_return_value_stub(address destination, const char* name, bool has_res);

  // Specialized stub implementations for UseSecondarySupersTable.
  void generate_lookup_secondary_supers_table_stub();

  // Slow path implementation for UseSecondarySupersTable.
  address generate_lookup_secondary_supers_table_slow_path_stub();

  void create_control_words();

  // Initialization: emit stub groups in VM startup phases.
  void generate_preuniverse_stubs();
  void generate_initial_stubs();
  void generate_continuation_stubs();
  void generate_compiler_stubs();
  void generate_final_stubs();

public:
  // Emits code into the supplied CodeBuffer for the stub blob identified
  // by blob_id.
  StubGenerator(CodeBuffer* code, BlobId blob_id);
};
659 
660 #endif // CPU_X86_STUBGENERATOR_X86_64_HPP