1 /*
  2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #ifndef CPU_X86_STUBGENERATOR_X86_64_HPP
 26 #define CPU_X86_STUBGENERATOR_X86_64_HPP
 27 
 28 #include "code/codeBlob.hpp"
 29 #include "runtime/continuation.hpp"
 30 #include "runtime/stubCodeGenerator.hpp"
 31 
 32 // Stub Code definitions
 33 
 34 class StubGenerator: public StubCodeGenerator {
    // x86_64 stub generator (see the include guard of this header). It derives
    // from StubCodeGenerator and declares, as private members, the generate_*
    // routines (most of which return an `address`) together with the assembler
    // helpers they share. Only the two array_overlap_test() convenience
    // overloads are defined inline; every other member is defined elsewhere.
    // NOTE(review): the returned `address` is presumably the entry point of the
    // generated stub - confirm against the definitions.
 35  private:
 36 
 37   // Call stubs are used to call Java from C.
    // return_address is taken by non-const reference, so the definition may
    // write to it.
 38   address generate_call_stub(address& return_address);
 39 
 40   // Return point for a Java call if there's an exception thrown in
 41   // Java code.  The exception is caught and transformed into a
 42   // pending exception stored in JavaThread that can be tested from
 43   // within the VM.
 44   //
 45   // Note: Usually the parameters are removed by the callee. In case
 46   // of an exception crossing an activation frame boundary, that is
 47   // not the case if the callee is compiled code => need to setup the
 48   // rsp.
 49   //
 50   // rax: exception oop
 51 
 52   address generate_catch_exception();
 53 
 54   // Continuation point for runtime calls returning with a pending
 55   // exception.  The pending exception check happened in the runtime
 56   // or native call stub.  The pending exception in Thread is
 57   // converted into a Java-level exception.
 58   //
 59   // Contract with Java-level exception handlers:
 60   // rax: exception
 61   // rdx: throwing pc
 62   //
 63   // NOTE: At entry of this stub, exception-pc must be on stack !!
 64 
 65   address generate_forward_exception();
 66 
 67   // Support for intptr_t OrderAccess::fence()
 68   address generate_orderaccess_fence();
 69 
 70   // Support for intptr_t get_previous_sp()
 71   //
 72   // This routine is used to find the previous stack pointer for the
 73   // caller.
 74   address generate_get_previous_sp();
 75 
 76   //----------------------------------------------------------------------------------------------------
 77   // Support for void verify_mxcsr()
 78   //
 79   // This routine is used with -Xcheck:jni to verify that native
 80   // JNI code does not return to Java code without restoring the
 81   // MXCSR register to our expected state.
 82 
 83   address generate_verify_mxcsr();
 84 
    // Fixup stubs, one per conversion: f2i, f2l, d2i, d2l.
    // NOTE(review): presumably float/double to int/long conversion fixups, going
    // by the names only - confirm the exact contract in the definitions.
 85   address generate_f2i_fixup();
 86   address generate_f2l_fixup();
 87   address generate_d2i_fixup();
 88   address generate_d2l_fixup();
 89 
    // Generators for lookup tables, permutation tables and masks. Each takes the
    // name of the stub as stub_name; some additionally take the mask value or
    // an element size (esize).
 90   address generate_count_leading_zeros_lut(const char *stub_name);
 91   address generate_popcount_avx_lut(const char *stub_name);
 92   address generate_iota_indices(const char *stub_name);
 93   address generate_vector_reverse_bit_lut(const char *stub_name);
 94 
 95   address generate_vector_reverse_byte_perm_mask_long(const char *stub_name);
 96   address generate_vector_reverse_byte_perm_mask_int(const char *stub_name);
 97   address generate_vector_reverse_byte_perm_mask_short(const char *stub_name);
 98   address generate_vector_byte_shuffle_mask(const char *stub_name);
 99 
100   address generate_fp_mask(const char *stub_name, int64_t mask);
101 
102   address generate_compress_perm_table(const char *stub_name, int32_t esize);
103 
104   address generate_expand_perm_table(const char *stub_name, int32_t esize);
105 
106   address generate_vector_mask(const char *stub_name, int64_t mask);
107 
108   address generate_vector_byte_perm_mask(const char *stub_name);
109 
110   address generate_vector_fp_mask(const char *stub_name, int64_t mask);
111 
    // Takes a vector length and up to sixteen 32-bit values. val0..val3 are
    // mandatory; val4..val15 default to 0, so callers may pass only as many
    // values as they need.
112   address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
113                                      int32_t val0, int32_t val1, int32_t val2, int32_t val3,
114                                      int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
115                                      int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
116                                      int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0);
117 
118   // Non-destructive plausibility checks for oops
119   address generate_verify_oop();
120 
121   // Verify that a register contains a clean 32-bit positive value
122   // (high 32 bits are 0) so that it can be used in 64-bit shifts.
123   void assert_clean_int(Register Rint, Register Rtmp);
124 
125   // Generate overlap test for array copy stubs.
    // The three-argument form accepts both a target address and a label
    // pointer; the two inline overloads below forward to it with exactly one
    // of the two supplied and the other passed as nullptr.
126   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf);
127 
    // Address-target form: asserts that the target is non-null, then forwards
    // with a null label pointer.
128   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
129     assert(no_overlap_target != nullptr, "must be generated");
130     array_overlap_test(no_overlap_target, nullptr, sf);
131   }
    // Label-target form: forwards the label's address with a null target address.
132   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
133     array_overlap_test(nullptr, &L_no_overlap, sf);
134   }
135 
136 
137   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
    // nargs defaults to 3.
138   void setup_arg_regs(int nargs = 3);
139   void restore_arg_regs();
140 
    // Data member that exists only in ASSERT builds.
    // NOTE(review): presumably records whether the *_using_thread variants
    // below currently have registers saved in the thread - confirm.
141 #ifdef ASSERT
142   bool _regs_in_thread;
143 #endif
144 
145   // This is used in places where r10 is a scratch register, and can
146   // be adapted if r9 is needed also.
147   void setup_arg_regs_using_thread(int nargs = 3);
148 
149   void restore_arg_regs_using_thread();
150 
151   // Copy big chunks forward
152   void copy_bytes_forward(Register end_from, Register end_to,
153                           Register qword_count, Register tmp1,
154                           Register tmp2, Label& L_copy_bytes,
155                           Label& L_copy_8_bytes, DecoratorSet decorators,
156                           BasicType type);
157 
158   // Copy big chunks backward
159   void copy_bytes_backward(Register from, Register dest,
160                            Register qword_count, Register tmp1,
161                            Register tmp2, Label& L_copy_bytes,
162                            Label& L_copy_8_bytes, DecoratorSet decorators,
163                            BasicType type);
164 
    // Setup/restore pair parameterized by the element type.
    // NOTE(review): how these relate to setup_arg_regs()/restore_arg_regs()
    // above is not visible in this header - confirm in the definitions.
165   void setup_argument_regs(BasicType type);
166 
167   void restore_argument_regs(BasicType type);
168 
    // Everything up to the matching #endif is declared only when
    // COMPILER2_OR_JVMCI is non-zero.
169 #if COMPILER2_OR_JVMCI
170   // Following rules apply to AVX3 optimized arraycopy stubs:
171   // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
172   //   for both special cases (various small block sizes) and aligned copy loop. This is the
173   //   default configuration.
174   // - If copy length is above AVX3Threshold, then implementation uses 64 byte vectors (ZMMs)
175   //   for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it.
176   // - If user forces MaxVectorSize=32 then above 4096 bytes it is seen that REP MOVs shows a
177   //   better performance for disjoint copies. For conjoint/backward copy vector based
178   //   copy performs better.
179   // - If user sets AVX3Threshold=0, then special cases for small block sizes operate over
180   //   64 byte vector registers (ZMMs).
181 
182   address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift,
183                                              bool aligned, bool is_oop, bool dest_uninitialized);
184 
185   address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
186                                              address nooverlap_target, bool aligned, bool is_oop,
187                                              bool dest_uninitialized);
188 
189   void arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
190                                     Register to, Register count, int shift,
191                                     Register index, Register temp,
192                                     bool use64byteVector, Label& L_entry, Label& L_exit);
193 
194   void arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
195                                     Register to, Register count, int shift,
196                                     Register index, Register temp, Label& L_exit);
197 
198   void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
199                                              Register to, Register start_index, Register end_index,
200                                              Register count, int shift, Register temp,
201                                              bool use64byteVector, Label& L_entry, Label& L_exit);
202 
203   void arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
204                             Register temp3, Register temp4, Register count,
205                             XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
206                             XMMRegister xmm4, int shift);
207 
    // Fixed-size and masked copy helpers. Where defaulted, shift is
    // Address::times_1, offset is 0 and use64byteVector is false.
208   void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
209                   int shift = Address::times_1, int offset = 0);
210 
211   void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
212                   bool conjoint, int shift = Address::times_1, int offset = 0,
213                   bool use64byteVector = false);
214 
215   void copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1, XMMRegister xmm2,
216                                 XMMRegister xmm3, XMMRegister xmm4, int shift, int offset = 0);
217 
218   void copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
219                          KRegister mask, Register length, Register index,
220                          Register temp, int shift = Address::times_1, int offset = 0,
221                          bool use64byteVector = false);
222 
223   void copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
224                          KRegister mask, Register length, Register index,
225                          Register temp, int shift = Address::times_1, int offset = 0);
226 #endif // COMPILER2_OR_JVMCI
227 
    // Array copy and fill stub generators, one disjoint/conjoint pair per
    // element size. The conjoint variants additionally take nooverlap_target;
    // the int and long variants take is_oop, and dest_uninitialized defaults
    // to false.
    // NOTE(review): `entry` is an address* and is presumably an out-parameter
    // receiving a secondary entry point - confirm in the definitions.
228   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name);
229 
230   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
231                                       address* entry, const char *name);
232 
233   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name);
234 
235   address generate_fill(BasicType t, bool aligned, const char *name);
236 
237   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
238                                        address *entry, const char *name);
239   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
240                                          const char *name, bool dest_uninitialized = false);
241   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
242                                          address *entry, const char *name,
243                                          bool dest_uninitialized = false);
244   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
245                                           const char *name, bool dest_uninitialized = false);
246   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
247                                           address nooverlap_target, address *entry,
248                                           const char *name, bool dest_uninitialized = false);
249 
250   // Helper for generating a dynamic type check.
251   // Smashes no registers.
252   void generate_type_check(Register sub_klass,
253                            Register super_check_offset,
254                            Register super_klass,
255                            Label& L_success);
256 
257   // Generate checkcasting array copy stub
258   address generate_checkcast_copy(const char *name, address *entry,
259                                   bool dest_uninitialized = false);
260 
261   // Generate 'unsafe' array copy stub
262   // Though just as safe as the other stubs, it takes an unscaled
263   // size_t argument instead of an element count.
264   //
265   // Examines the alignment of the operands and dispatches
266   // to a long, int, short, or byte copy loop.
267   address generate_unsafe_copy(const char *name,
268                                address byte_copy_entry, address short_copy_entry,
269                                address int_copy_entry, address long_copy_entry);
270 
271   // Generate 'unsafe' set memory stub
272   // Though just as safe as the other stubs, it takes an unscaled
273   // size_t argument instead of an element count.
274   //
275   // Examines the alignment of the operands and dispatches
276   // to an int, short, or byte copy loop.
277   address generate_unsafe_setmemory(const char *name, address byte_copy_entry);
278 
279   // Perform range checks on the proposed arraycopy.
280   // Kills temp, but nothing else.
281   // Also, clean the sign bits of src_pos and dst_pos.
282   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
283                               Register src_pos, // source position (c_rarg1)
284                               Register dst,     // destination array oop (c_rarg2)
285                               Register dst_pos, // destination position (c_rarg3)
286                               Register length,
287                               Register temp,
288                               Label& L_failed);
289 
290   // Generate generic array copy stubs
291   address generate_generic_copy(const char *name,
292                                 address byte_copy_entry, address short_copy_entry,
293                                 address int_copy_entry, address oop_copy_entry,
294                                 address long_copy_entry, address checkcast_copy_entry);
295 
296   address generate_data_cache_writeback();
297 
298   address generate_data_cache_writeback_sync();
299 
300   void generate_arraycopy_stubs();
301 
302 
303   // MD5 stubs
304 
305   // ofs and limit are used for multi-block byte array.
306   // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
307   address generate_md5_implCompress(bool multi_block, const char *name);
308 
309 
310   // SHA stubs
311 
312   // ofs and limit are used for multi-block byte array.
313   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
314   address generate_sha1_implCompress(bool multi_block, const char *name);
315 
316   // ofs and limit are used for multi-block byte array.
317   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
318   address generate_sha256_implCompress(bool multi_block, const char *name);
319   address generate_sha512_implCompress(bool multi_block, const char *name);
320 
321   // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
322   address generate_pshuffle_byte_flip_mask_sha512();
323 
324   address generate_upper_word_mask();
325   address generate_shuffle_byte_flip_mask();
326   address generate_pshuffle_byte_flip_mask();
327 
328 
329   // AES intrinsic stubs
330 
331   address generate_aescrypt_encryptBlock();
332 
333   address generate_aescrypt_decryptBlock();
334 
335   address generate_cipherBlockChaining_encryptAESCrypt();
336 
337   // A version of CBC/AES Decrypt which does 4 blocks in a loop at a time
338   // to hide instruction latency
339   address generate_cipherBlockChaining_decryptAESCrypt_Parallel();
340 
341   address generate_electronicCodeBook_encryptAESCrypt();
342 
343   void aesecb_encrypt(Register source_addr, Register dest_addr, Register key, Register len);
344 
345   address generate_electronicCodeBook_decryptAESCrypt();
346 
347   void aesecb_decrypt(Register source_addr, Register dest_addr, Register key, Register len);
348 
349   // Vector AES Galois Counter Mode implementation
350   address generate_galoisCounterMode_AESCrypt();
351   void aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key,
352                       Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter);
353 
354   // AVX2 AES Galois Counter Mode implementation
355   address generate_avx2_galoisCounterMode_AESCrypt();
356   void aesgcm_avx2(Register in, Register len, Register ct, Register out, Register key,
357                    Register state, Register subkeyHtbl, Register counter);
358 
359   // Vector AES Counter implementation
360   address generate_counterMode_VectorAESCrypt();
361   void aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
362                       Register len_reg, Register used, Register used_addr, Register saved_encCounter_start);
363 
364   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
365   // to hide instruction latency
366   address generate_counterMode_AESCrypt_Parallel();
367 
368   address generate_cipherBlockChaining_decryptVectorAESCrypt();
369 
370   address generate_key_shuffle_mask();
371 
    // AES round helpers. roundDec is overloaded: a one-argument form taking only
    // an XMM register and a two-argument form taking a key register and a
    // round number.
372   void roundDec(XMMRegister xmm_reg);
373   void roundDeclast(XMMRegister xmm_reg);
374   void roundEnc(XMMRegister key, int rnum);
375   void lastroundEnc(XMMRegister key, int rnum);
376   void roundDec(XMMRegister key, int rnum);
377   void lastroundDec(XMMRegister key, int rnum);
    // AVX-512 AES-GCM related functions
378   void gfmul_avx512(XMMRegister ghash, XMMRegister hkey);
379   void ghash16_encrypt_parallel16_avx512(Register in, Register out, Register ct, Register pos, Register avx512_subkeyHtbl,
380                                          Register CTR_CHECK, Register NROUNDS, Register key, XMMRegister CTR, XMMRegister GHASH,
381                                          XMMRegister ADDBE_4x4, XMMRegister ADDBE_1234, XMMRegister ADD_1234, XMMRegister SHUF_MASK,
382                                          bool hk_broadcast, bool is_hash_start, bool do_hash_reduction, bool do_hash_hxor,
383                                          bool no_ghash_in, int ghashin_offset, int aesout_offset, int hashkey_offset);
384   void generateHtbl_32_blocks_avx512(Register htbl, Register avx512_htbl);
385   void initial_blocks_16_avx512(Register in, Register out, Register ct, Register pos, Register key, Register avx512_subkeyHtbl,
386                                 Register CTR_CHECK, Register rounds, XMMRegister CTR, XMMRegister GHASH,  XMMRegister ADDBE_4x4,
387                                 XMMRegister ADDBE_1234, XMMRegister ADD_1234, XMMRegister SHUF_MASK, int stack_offset);
388   void gcm_enc_dec_last_avx512(Register len, Register in, Register pos, XMMRegister HASH, XMMRegister SHUFM, Register subkeyHtbl,
389                                int ghashin_offset, int hashkey_offset, bool start_ghash, bool do_reduction);
390   void ghash16_avx512(bool start_ghash, bool do_reduction, bool uload_shuffle, bool hk_broadcast, bool do_hxor,
391                       Register in, Register pos, Register subkeyHtbl, XMMRegister HASH, XMMRegister SHUFM, int in_offset,
392                       int in_disp, int displacement, int hashkey_offset);
393   void aesgcm_avx512(Register in, Register len, Register ct, Register out, Register key,
394                      Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter);
395   // AVX2 AES-GCM related functions
396   void initial_blocks_avx2(XMMRegister ctr, Register rounds, Register key, Register len,
397                            Register in, Register out, Register ct, XMMRegister aad_hashx, Register pos);
398   void gfmul_avx2(XMMRegister GH, XMMRegister HK);
399   void generateHtbl_8_block_avx2(Register htbl);
400   void ghash8_encrypt8_parallel_avx2(Register key, Register subkeyHtbl, XMMRegister ctr_blockx, Register in,
401                                      Register out, Register ct, Register pos, bool out_order, Register rounds,
402                                      XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
403                                      XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, XMMRegister xmm8);
404   void ghash_last_8_avx2(Register subkeyHtbl);
405 
406   // Load key and shuffle operation
    // Two overloads: the last argument is either an XMM register holding the
    // shuffle mask or a general-purpose scratch register.
407   void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask);
408   void ev_load_key(XMMRegister xmmdst, Register key, int offset, Register rscratch);
409 
410   // Utility routine for loading a 128-bit key word in little endian format
411   // can optionally specify that the shuffle mask is already in an xmmregister
412   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask);
413   void load_key(XMMRegister xmmdst, Register key, int offset, Register rscratch);
414 
415   // Utility routine for incrementing the 128-bit counter (IV in CTR mode)
416   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block);
417   void ev_add128(XMMRegister xmmdst, XMMRegister xmmsrc1, XMMRegister xmmsrc2,
418                  int vector_len, KRegister ktmp, XMMRegister ones);
419   void generate_aes_stubs();
420 
421 
422   // GHASH stubs
423 
424   void generate_ghash_stubs();
425 
426   void schoolbookAAD(int i, Register subkeyH, XMMRegister data, XMMRegister tmp0,
427                      XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3);
428   void gfmul(XMMRegister tmp0, XMMRegister t);
429   void generateHtbl_one_block(Register htbl, Register rscratch);
430   void generateHtbl_eight_blocks(Register htbl);
431   void avx_ghash(Register state, Register htbl, Register data, Register blocks);
432 
433   // Used by GHASH and AES stubs.
434   address ghash_polynomial_addr();
435   address ghash_shufflemask_addr();
436   address ghash_long_swap_mask_addr(); // byte swap x86 long
437   address ghash_byte_swap_mask_addr(); // byte swap x86 byte array
438 
439   // Single and multi-block ghash operations
440   address generate_ghash_processBlocks();
441 
442   // Ghash single and multi block operations using AVX instructions
443   address generate_avx_ghash_processBlocks();
444 
445   // ChaCha20 stubs and helper functions
446   void generate_chacha_stubs();
447   address generate_chacha20Block_avx();
448   address generate_chacha20Block_avx512();
449   void cc20_quarter_round_avx(XMMRegister aVec, XMMRegister bVec,
450     XMMRegister cVec, XMMRegister dVec, XMMRegister scratch,
451     XMMRegister lrot8, XMMRegister lrot16, int vector_len);
452   void cc20_shift_lane_org(XMMRegister bVec, XMMRegister cVec,
453     XMMRegister dVec, int vector_len, bool colToDiag);
454   void cc20_keystream_collate_avx512(XMMRegister aVec, XMMRegister bVec,
455     XMMRegister cVec, XMMRegister dVec, Register baseAddr, int baseOffset);
456 
457   // Poly1305 multiblock using IFMA instructions
458   address generate_poly1305_processBlocks();
459   void poly1305_process_blocks_avx512(const Register input, const Register length,
460                                       const Register A0, const Register A1, const Register A2,
461                                       const Register R0, const Register R1, const Register C1);
462   void poly1305_multiply_scalar(const Register a0, const Register a1, const Register a2,
463                                 const Register r0, const Register r1, const Register c1, bool only128,
464                                 const Register t0, const Register t1, const Register t2,
465                                 const Register mulql, const Register mulqh);
466   void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
467                                  const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P,
468                                  const XMMRegister P0L, const XMMRegister P0H, const XMMRegister P1L, const XMMRegister P1H, const XMMRegister P2L, const XMMRegister P2H,
469                                  const XMMRegister TMP, const Register rscratch);
470   void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, const Register t0, const Register t1);
471   void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs, const Register t0, const Register t1);
472   void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1,
473                              const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG,
474                              const XMMRegister TMP, const Register rscratch);
475   // Poly1305 AVX2 implementation
476   void poly1305_process_blocks_avx2(const Register input, const Register length,
477     const Register a0, const Register a1, const Register a2,
478     const Register r0, const Register r1, const Register c1);
479   void poly1305_msg_mul_reduce_vec4_avx2(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
480                                    const Address R0, const Address R1, const Address R2,
481                                    const Address R1P, const Address R2P,
482                                    const XMMRegister P0L, const XMMRegister P0H,
483                                    const XMMRegister P1L, const XMMRegister P1H,
484                                    const XMMRegister P2L, const XMMRegister P2H,
485                                    const XMMRegister YTMP1, const XMMRegister YTMP2,
486                                    const XMMRegister YTMP3, const XMMRegister YTMP4,
487                                    const XMMRegister YTMP5, const XMMRegister YTMP6,
488                                    const Register input, const Register length, const Register rscratch);
489   void poly1305_mul_reduce_vec4_avx2(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
490                                const XMMRegister R0, const XMMRegister R1, const XMMRegister R2,
491                                const XMMRegister R1P, const XMMRegister R2P,
492                                const XMMRegister P0L, const XMMRegister P0H,
493                                const XMMRegister P1L, const XMMRegister P1H,
494                                const XMMRegister P2L, const XMMRegister P2H,
495                                const XMMRegister YTMP1, const Register rscratch);
496 
497   address generate_intpoly_montgomeryMult_P256();
498   address generate_intpoly_assign();
499 
500   // SHA3 stubs
501   void generate_sha3_stubs();
502   address generate_sha3_implCompress(bool multiBlock, const char *name);
503 
504   // BASE64 stubs
505 
506   address base64_shuffle_addr();
507   address base64_avx2_shuffle_addr();
508   address base64_avx2_input_mask_addr();
509   address base64_avx2_lut_addr();
510   address base64_encoding_table_addr();
511 
512   // Code for generating Base64 encoding.
513   // Intrinsic function prototype in Base64.java:
514   // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
515   address generate_base64_encodeBlock();
516 
517   // base64 AVX512vbmi tables
518   address base64_vbmi_lookup_lo_addr();
519   address base64_vbmi_lookup_hi_addr();
520   address base64_vbmi_lookup_lo_url_addr();
521   address base64_vbmi_lookup_hi_url_addr();
522   address base64_vbmi_pack_vec_addr();
523   address base64_vbmi_join_0_1_addr();
524   address base64_vbmi_join_1_2_addr();
525   address base64_vbmi_join_2_3_addr();
526   address base64_decoding_table_addr();
527   address base64_AVX2_decode_tables_addr();
528   address base64_AVX2_decode_LUT_tables_addr();
529 
530   // Code for generating Base64 decoding.
531   //
532   // Based on the article (and associated code) from https://arxiv.org/abs/1910.05109.
533   //
534   // Intrinsic function prototype in Base64.java:
535   // private void decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, isMIME);
536   address generate_base64_decodeBlock();
537 
    // Checksum stubs. The CRC32C generator is told by its caller whether
    // pclmulqdq is supported.
538   address generate_updateBytesCRC32();
539   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported);
540 
541   address generate_updateBytesAdler32();
542 
543   address generate_multiplyToLen();
544 
545   address generate_vectorizedMismatch();
546 
547   address generate_squareToLen();
548 
549   address generate_method_entry_barrier();
550 
551   address generate_mulAdd();
552 
553   address generate_bigIntegerRightShift();
554   address generate_bigIntegerLeftShift();
555 
556   address generate_float16ToFloat();
557   address generate_floatToFloat16();
558 
559   // Libm math stubs (sin, cos, tan, tanh, exp, pow, log, log10, fmod)
560 
561   address generate_libmSin();
562   address generate_libmCos();
563   address generate_libmTan();
564   address generate_libmTanh();
565   address generate_libmExp();
566   address generate_libmPow();
567   address generate_libmLog();
568   address generate_libmLog10();
569   address generate_libmFmod();
570 
571   // Shared constants
    // Static members of type address; they are only declared here.
    // NOTE(review): presumably constant tables shared between the libm stubs
    // above - confirm where they are defined and initialized.
572   static address ZERO;
573   static address NEG_ZERO;
574   static address ONE;
575   static address ONEHALF;
576   static address SIGN_MASK;
577   static address TWO_POW_55;
578   static address TWO_POW_M55;
579   static address SHIFTER;
580   static address PI32INV;
581   static address PI_INV_TABLE;
582   static address Ctable;
583   static address SC_1;
584   static address SC_2;
585   static address SC_3;
586   static address SC_4;
587   static address PI_4;
588   static address P_1;
589   static address P_3;
590   static address P_2;
591 
592   void generate_libm_stubs();
593 
    // Declared only when COMPILER2 is defined. fnptrs is an array of addresses
    // supplied by the caller.
    // NOTE(review): presumably filled in by the definition - confirm.
594 #ifdef COMPILER2
595   void generate_string_indexof(address *fnptrs);
596 #endif
597 
    // Continuation thaw stubs: a parameterized form taking a label and a
    // Continuation::thaw_kind, and a parameterless form.
598   address generate_cont_thaw(const char* label, Continuation::thaw_kind kind);
599   address generate_cont_thaw();
600 
601   // TODO: will probably need multiple return barriers depending on return type
602   address generate_cont_returnBarrier();
603   address generate_cont_returnBarrier_exception();
604 
605   // Continuation point for throwing of implicit exceptions that are
606   // not handled in the current activation. Fabricates an exception
607   // oop and initiates normal exception dispatching in this
608   // frame. Since we need to preserve callee-saved values (currently
609   // only for C2, but done for C1 as well) we need a callee-saved oop
610   // map and therefore have to make these stubs into RuntimeStubs
611   // rather than BufferBlobs.  If the compiler needs all registers to
612   // be preserved between the fault point and the exception handler
613   // then it must assume responsibility for that in
614   // AbstractCompiler::continuation_for_implicit_null_exception or
615   // continuation_for_implicit_division_by_zero_exception. All other
616   // implicit exceptions (e.g., NullPointerException or
617   // AbstractMethodError on entry) are either at call sites or
618   // otherwise assume that stack unwinding will be initiated, so
619   // caller saved registers were assumed volatile in the compiler.
    //
    // arg1 and arg2 default to noreg.
620   address generate_throw_exception(const char* name,
621                                    address runtime_entry,
622                                    Register arg1 = noreg,
623                                    Register arg2 = noreg);
624 
625   // shared exception handler for FFM upcall stubs
626   address generate_upcall_stub_exception_handler();
627   address generate_upcall_stub_load_target();
628 
629   // Specialized stub implementations for UseSecondarySupersTable.
630   address generate_lookup_secondary_supers_table_stub(u1 super_klass_index);
631 
632   // Slow path implementation for UseSecondarySupersTable.
633   address generate_lookup_secondary_supers_table_slow_path_stub();
634 
635   void create_control_words();
636 
637   // Initialization
    // One entry point per group of stubs.
    // NOTE(review): which of these runs for which StubsKind is decided in the
    // constructor's definition, which is not visible here.
638   void generate_initial_stubs();
639   void generate_continuation_stubs();
640   void generate_compiler_stubs();
641   void generate_final_stubs();
642 
643  public:
    // The only public member: constructs the generator from a code buffer and
    // the kind of stubs requested.
644   StubGenerator(CodeBuffer* code, StubsKind kind);
645 };
646 
647 #endif // CPU_X86_STUBGENERATOR_X86_64_HPP