1 /* 2 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #ifndef CPU_X86_STUBGENERATOR_X86_64_HPP 26 #define CPU_X86_STUBGENERATOR_X86_64_HPP 27 28 #include "code/codeBlob.hpp" 29 #include "runtime/continuation.hpp" 30 #include "runtime/stubCodeGenerator.hpp" 31 32 // Stub Code definitions 33 34 class StubGenerator: public StubCodeGenerator { 35 private: 36 37 // Call stubs are used to call Java from C. 38 address generate_call_stub(address& return_address); 39 40 // Return point for a Java call if there's an exception thrown in 41 // Java code. The exception is caught and transformed into a 42 // pending exception stored in JavaThread that can be tested from 43 // within the VM. 44 // 45 // Note: Usually the parameters are removed by the callee. In case 46 // of an exception crossing an activation frame boundary, that is 47 // not the case if the callee is compiled code => need to setup the 48 // rsp. 49 // 50 // rax: exception oop 51 52 address generate_catch_exception(); 53 54 // Continuation point for runtime calls returning with a pending 55 // exception. The pending exception check happened in the runtime 56 // or native call stub. The pending exception in Thread is 57 // converted into a Java-level exception. 58 // 59 // Contract with Java-level exception handlers: 60 // rax: exception 61 // rdx: throwing pc 62 // 63 // NOTE: At entry of this stub, exception-pc must be on stack !! 64 65 address generate_forward_exception(); 66 67 // Support for intptr_t OrderAccess::fence() 68 address generate_orderaccess_fence(); 69 70 // Support for intptr_t get_previous_sp() 71 // 72 // This routine is used to find the previous stack pointer for the 73 // caller. 74 address generate_get_previous_sp(); 75 76 //---------------------------------------------------------------------------------------------------- 77 // Support for void verify_mxcsr() 78 // 79 // This routine is used with -Xcheck:jni to verify that native 80 // JNI code does not return to Java code without restoring the 81 // MXCSR register to our expected state. 82 83 address generate_verify_mxcsr(); 84 85 address generate_f2i_fixup(); 86 address generate_f2l_fixup(); 87 address generate_d2i_fixup(); 88 address generate_d2l_fixup(); 89 90 address generate_count_leading_zeros_lut(const char *stub_name); 91 address generate_popcount_avx_lut(const char *stub_name); 92 address generate_iota_indices(const char *stub_name); 93 address generate_vector_reverse_bit_lut(const char *stub_name); 94 95 address generate_vector_reverse_byte_perm_mask_long(const char *stub_name); 96 address generate_vector_reverse_byte_perm_mask_int(const char *stub_name); 97 address generate_vector_reverse_byte_perm_mask_short(const char *stub_name); 98 address generate_vector_byte_shuffle_mask(const char *stub_name); 99 100 address generate_fp_mask(const char *stub_name, int64_t mask); 101 102 address generate_vector_mask(const char *stub_name, int64_t mask); 103 104 address generate_vector_byte_perm_mask(const char *stub_name); 105 106 address generate_vector_fp_mask(const char *stub_name, int64_t mask); 107 108 address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len, 109 int32_t val0, int32_t val1, int32_t val2, int32_t val3, 110 int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0, 111 int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0, 112 int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0); 113 114 // Non-destructive plausibility checks for oops 115 address generate_verify_oop(); 116 117 // Verify that a register contains clean 32-bits positive value 118 // (high 32-bits are 0) so it could be used in 64-bits shifts. 119 void assert_clean_int(Register Rint, Register Rtmp); 120 121 // Generate overlap test for array copy stubs 122 void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf); 123 124 void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) { 125 assert(no_overlap_target != NULL, "must be generated"); 126 array_overlap_test(no_overlap_target, NULL, sf); 127 } 128 void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) { 129 array_overlap_test(NULL, &L_no_overlap, sf); 130 } 131 132 133 // Shuffle first three arg regs on Windows into Linux/Solaris locations. 134 void setup_arg_regs(int nargs = 3); 135 void restore_arg_regs(); 136 137 #ifdef ASSERT 138 bool _regs_in_thread; 139 #endif 140 141 // This is used in places where r10 is a scratch register, and can 142 // be adapted if r9 is needed also. 143 void setup_arg_regs_using_thread(); 144 145 void restore_arg_regs_using_thread(); 146 147 // Copy big chunks forward 148 void copy_bytes_forward(Register end_from, Register end_to, 149 Register qword_count, Register to, 150 Label& L_copy_bytes, Label& L_copy_8_bytes); 151 152 // Copy big chunks backward 153 void copy_bytes_backward(Register from, Register dest, 154 Register qword_count, Register to, 155 Label& L_copy_bytes, Label& L_copy_8_bytes); 156 157 void setup_argument_regs(BasicType type); 158 159 void restore_argument_regs(BasicType type); 160 161 #if COMPILER2_OR_JVMCI 162 // Following rules apply to AVX3 optimized arraycopy stubs: 163 // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs) 164 // for both special cases (various small block sizes) and aligned copy loop. This is the 165 // default configuration. 166 // - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs) 167 // for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it. 168 // - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a 169 // better performance for disjoint copies. For conjoint/backward copy vector based 170 // copy performs better. 171 // - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over 172 // 64 byte vector registers (ZMMs). 173 174 address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift, 175 bool aligned, bool is_oop, bool dest_uninitialized); 176 177 address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift, 178 address nooverlap_target, bool aligned, bool is_oop, 179 bool dest_uninitialized); 180 181 void arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from, 182 Register to, Register count, int shift, 183 Register index, Register temp, 184 bool use64byteVector, Label& L_entry, Label& L_exit); 185 186 void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from, 187 Register to, Register start_index, Register end_index, 188 Register count, int shift, Register temp, 189 bool use64byteVector, Label& L_entry, Label& L_exit); 190 191 void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm, 192 int shift = Address::times_1, int offset = 0); 193 194 void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm, 195 bool conjoint, int shift = Address::times_1, int offset = 0, 196 bool use64byteVector = false); 197 198 void copy64_masked_avx(Register dst, Register src, XMMRegister xmm, 199 KRegister mask, Register length, Register index, 200 Register temp, int shift = Address::times_1, int offset = 0, 201 bool use64byteVector = false); 202 203 void copy32_masked_avx(Register dst, Register src, XMMRegister xmm, 204 KRegister mask, Register length, Register index, 205 Register temp, int shift = Address::times_1, int offset = 0); 206 #endif // COMPILER2_OR_JVMCI 207 208 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name); 209 210 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 211 address* entry, const char *name); 212 213 address generate_disjoint_short_copy(bool aligned, address *entry, const char *name); 214 215 address generate_fill(BasicType t, bool aligned, const char *name); 216 217 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 218 address *entry, const char *name); 219 address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry, 220 const char *name, bool dest_uninitialized = false); 221 address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target, 222 address *entry, const char *name, 223 bool dest_uninitialized = false); 224 address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry, 225 const char *name, bool dest_uninitialized = false); 226 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, 227 address nooverlap_target, address *entry, 228 const char *name, bool dest_uninitialized = false); 229 230 // Helper for generating a dynamic type check. 231 // Smashes no registers. 232 void generate_type_check(Register sub_klass, 233 Register super_check_offset, 234 Register super_klass, 235 Label& L_success); 236 237 // Generate checkcasting array copy stub 238 address generate_checkcast_copy(const char *name, address *entry, 239 bool dest_uninitialized = false); 240 241 // Generate 'unsafe' array copy stub 242 // Though just as safe as the other stubs, it takes an unscaled 243 // size_t argument instead of an element count. 244 // 245 // Examines the alignment of the operands and dispatches 246 // to a long, int, short, or byte copy loop. 247 address generate_unsafe_copy(const char *name, 248 address byte_copy_entry, address short_copy_entry, 249 address int_copy_entry, address long_copy_entry); 250 251 // Perform range checks on the proposed arraycopy. 252 // Kills temp, but nothing else. 253 // Also, clean the sign bits of src_pos and dst_pos. 254 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 255 Register src_pos, // source position (c_rarg1) 256 Register dst, // destination array oo (c_rarg2) 257 Register dst_pos, // destination position (c_rarg3) 258 Register length, 259 Register temp, 260 Label& L_failed); 261 262 // Generate generic array copy stubs 263 address generate_generic_copy(const char *name, 264 address byte_copy_entry, address short_copy_entry, 265 address int_copy_entry, address oop_copy_entry, 266 address long_copy_entry, address checkcast_copy_entry); 267 268 address generate_data_cache_writeback(); 269 270 address generate_data_cache_writeback_sync(); 271 272 void generate_arraycopy_stubs(); 273 274 275 // MD5 stubs 276 277 // ofs and limit are use for multi-block byte array. 278 // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs) 279 address generate_md5_implCompress(bool multi_block, const char *name); 280 281 282 // SHA stubs 283 284 // ofs and limit are use for multi-block byte array. 285 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) 286 address generate_sha1_implCompress(bool multi_block, const char *name); 287 288 // ofs and limit are use for multi-block byte array. 289 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) 290 address generate_sha256_implCompress(bool multi_block, const char *name); 291 address generate_sha512_implCompress(bool multi_block, const char *name); 292 293 // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 294 address generate_pshuffle_byte_flip_mask_sha512(); 295 296 address generate_upper_word_mask(); 297 address generate_shuffle_byte_flip_mask(); 298 address generate_pshuffle_byte_flip_mask(); 299 300 301 // AES intrinsic stubs 302 303 address generate_aescrypt_encryptBlock(); 304 305 address generate_aescrypt_decryptBlock(); 306 307 address generate_cipherBlockChaining_encryptAESCrypt(); 308 309 // A version of CBC/AES Decrypt which does 4 blocks in a loop at a time 310 // to hide instruction latency 311 address generate_cipherBlockChaining_decryptAESCrypt_Parallel(); 312 313 address generate_electronicCodeBook_encryptAESCrypt(); 314 315 void aesecb_encrypt(Register source_addr, Register dest_addr, Register key, Register len); 316 317 address generate_electronicCodeBook_decryptAESCrypt(); 318 319 void aesecb_decrypt(Register source_addr, Register dest_addr, Register key, Register len); 320 321 // Vector AES Galois Counter Mode implementation 322 address generate_galoisCounterMode_AESCrypt(); 323 void aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key, 324 Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter); 325 326 327 // Vector AES Counter implementation 328 address generate_counterMode_VectorAESCrypt(); 329 void aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter, 330 Register len_reg, Register used, Register used_addr, Register saved_encCounter_start); 331 332 // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time 333 // to hide instruction latency 334 address generate_counterMode_AESCrypt_Parallel(); 335 336 address generate_cipherBlockChaining_decryptVectorAESCrypt(); 337 338 address generate_key_shuffle_mask(); 339 340 void roundDec(XMMRegister xmm_reg); 341 void roundDeclast(XMMRegister xmm_reg); 342 void roundEnc(XMMRegister key, int rnum); 343 void lastroundEnc(XMMRegister key, int rnum); 344 void roundDec(XMMRegister key, int rnum); 345 void lastroundDec(XMMRegister key, int rnum); 346 void gfmul_avx512(XMMRegister ghash, XMMRegister hkey); 347 void generateHtbl_48_block_zmm(Register htbl, Register avx512_subkeyHtbl, Register rscratch); 348 void ghash16_encrypt16_parallel(Register key, Register subkeyHtbl, XMMRegister ctr_blockx, 349 XMMRegister aad_hashx, Register in, Register out, Register data, Register pos, bool reduction, 350 XMMRegister addmask, bool no_ghash_input, Register rounds, Register ghash_pos, 351 bool final_reduction, int index, XMMRegister counter_inc_mask); 352 // Load key and shuffle operation 353 void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask); 354 void ev_load_key(XMMRegister xmmdst, Register key, int offset, Register rscratch); 355 356 // Utility routine for loading a 128-bit key word in little endian format 357 // can optionally specify that the shuffle mask is already in an xmmregister 358 void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask); 359 void load_key(XMMRegister xmmdst, Register key, int offset, Register rscratch); 360 361 // Utility routine for increase 128bit counter (iv in CTR mode) 362 void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block); 363 364 void generate_aes_stubs(); 365 366 367 // GHASH stubs 368 369 void generate_ghash_stubs(); 370 371 void schoolbookAAD(int i, Register subkeyH, XMMRegister data, XMMRegister tmp0, 372 XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3); 373 void gfmul(XMMRegister tmp0, XMMRegister t); 374 void generateHtbl_one_block(Register htbl, Register rscratch); 375 void generateHtbl_eight_blocks(Register htbl); 376 void avx_ghash(Register state, Register htbl, Register data, Register blocks); 377 378 // Used by GHASH and AES stubs. 379 address ghash_polynomial_addr(); 380 address ghash_shufflemask_addr(); 381 address ghash_long_swap_mask_addr(); // byte swap x86 long 382 address ghash_byte_swap_mask_addr(); // byte swap x86 byte array 383 384 // Single and multi-block ghash operations 385 address generate_ghash_processBlocks(); 386 387 // Ghash single and multi block operations using AVX instructions 388 address generate_avx_ghash_processBlocks(); 389 390 // ChaCha20 stubs and helper functions 391 void generate_chacha_stubs(); 392 address generate_chacha20Block_avx(); 393 address generate_chacha20Block_avx512(); 394 void cc20_quarter_round_avx(XMMRegister aVec, XMMRegister bVec, 395 XMMRegister cVec, XMMRegister dVec, XMMRegister scratch, 396 XMMRegister lrot8, XMMRegister lrot16, int vector_len); 397 void cc20_shift_lane_org(XMMRegister bVec, XMMRegister cVec, 398 XMMRegister dVec, int vector_len, bool colToDiag); 399 void cc20_keystream_collate_avx512(XMMRegister aVec, XMMRegister bVec, 400 XMMRegister cVec, XMMRegister dVec, Register baseAddr, int baseOffset); 401 402 // Poly1305 multiblock using IFMA instructions 403 address generate_poly1305_processBlocks(); 404 void poly1305_process_blocks_avx512(const Register input, const Register length, 405 const Register A0, const Register A1, const Register A2, 406 const Register R0, const Register R1, const Register C1); 407 void poly1305_multiply_scalar(const Register a0, const Register a1, const Register a2, 408 const Register r0, const Register r1, const Register c1, bool only128, 409 const Register t0, const Register t1, const Register t2, 410 const Register mulql, const Register mulqh); 411 void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, 412 const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, 413 const XMMRegister P0L, const XMMRegister P0H, const XMMRegister P1L, const XMMRegister P1H, const XMMRegister P2L, const XMMRegister P2H, 414 const XMMRegister TMP, const Register rscratch); 415 void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, const Register t0, const Register t1); 416 void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs, const Register t0, const Register t1); 417 void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1, 418 const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, 419 const XMMRegister TMP, const Register rscratch); 420 421 // BASE64 stubs 422 423 address base64_shuffle_addr(); 424 address base64_avx2_shuffle_addr(); 425 address base64_avx2_input_mask_addr(); 426 address base64_avx2_lut_addr(); 427 address base64_encoding_table_addr(); 428 429 // Code for generating Base64 encoding. 430 // Intrinsic function prototype in Base64.java: 431 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) 432 address generate_base64_encodeBlock(); 433 434 // base64 AVX512vbmi tables 435 address base64_vbmi_lookup_lo_addr(); 436 address base64_vbmi_lookup_hi_addr(); 437 address base64_vbmi_lookup_lo_url_addr(); 438 address base64_vbmi_lookup_hi_url_addr(); 439 address base64_vbmi_pack_vec_addr(); 440 address base64_vbmi_join_0_1_addr(); 441 address base64_vbmi_join_1_2_addr(); 442 address base64_vbmi_join_2_3_addr(); 443 address base64_decoding_table_addr(); 444 address base64_AVX2_decode_tables_addr(); 445 address base64_AVX2_decode_LUT_tables_addr(); 446 447 // Code for generating Base64 decoding. 448 // 449 // Based on the article (and associated code) from https://arxiv.org/abs/1910.05109. 450 // 451 // Intrinsic function prototype in Base64.java: 452 // private void decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, isMIME); 453 address generate_base64_decodeBlock(); 454 455 address generate_updateBytesCRC32(); 456 address generate_updateBytesCRC32C(bool is_pclmulqdq_supported); 457 458 address generate_updateBytesAdler32(); 459 460 address generate_multiplyToLen(); 461 462 address generate_vectorizedMismatch(); 463 464 address generate_squareToLen(); 465 466 address generate_method_entry_barrier(); 467 468 address generate_check_lock_stack(); 469 470 address generate_mulAdd(); 471 472 address generate_bigIntegerRightShift(); 473 address generate_bigIntegerLeftShift(); 474 475 476 // Libm trigonometric stubs 477 478 address generate_libmSin(); 479 address generate_libmCos(); 480 address generate_libmTan(); 481 address generate_libmExp(); 482 address generate_libmPow(); 483 address generate_libmLog(); 484 address generate_libmLog10(); 485 486 // Shared constants 487 static address ZERO; 488 static address NEG_ZERO; 489 static address ONE; 490 static address ONEHALF; 491 static address SIGN_MASK; 492 static address TWO_POW_55; 493 static address TWO_POW_M55; 494 static address SHIFTER; 495 static address PI32INV; 496 static address PI_INV_TABLE; 497 static address Ctable; 498 static address SC_1; 499 static address SC_2; 500 static address SC_3; 501 static address SC_4; 502 static address PI_4; 503 static address P_1; 504 static address P_3; 505 static address P_2; 506 507 void generate_libm_stubs(); 508 509 510 address generate_cont_thaw(const char* label, Continuation::thaw_kind kind); 511 address generate_cont_thaw(); 512 513 // TODO: will probably need multiple return barriers depending on return type 514 address generate_cont_returnBarrier(); 515 address generate_cont_returnBarrier_exception(); 516 517 #if INCLUDE_JFR 518 519 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 520 // It returns a jobject handle to the event writer. 521 // The handle is dereferenced and the return value is the event writer oop. 522 RuntimeStub* generate_jfr_write_checkpoint(); 523 524 #endif // INCLUDE_JFR 525 526 // Continuation point for throwing of implicit exceptions that are 527 // not handled in the current activation. Fabricates an exception 528 // oop and initiates normal exception dispatching in this 529 // frame. Since we need to preserve callee-saved values (currently 530 // only for C2, but done for C1 as well) we need a callee-saved oop 531 // map and therefore have to make these stubs into RuntimeStubs 532 // rather than BufferBlobs. If the compiler needs all registers to 533 // be preserved between the fault point and the exception handler 534 // then it must assume responsibility for that in 535 // AbstractCompiler::continuation_for_implicit_null_exception or 536 // continuation_for_implicit_division_by_zero_exception. All other 537 // implicit exceptions (e.g., NullPointerException or 538 // AbstractMethodError on entry) are either at call sites or 539 // otherwise assume that stack unwinding will be initiated, so 540 // caller saved registers were assumed volatile in the compiler. 541 address generate_throw_exception(const char* name, 542 address runtime_entry, 543 Register arg1 = noreg, 544 Register arg2 = noreg); 545 546 void create_control_words(); 547 548 // Initialization 549 void generate_initial(); 550 void generate_phase1(); 551 void generate_all(); 552 553 public: 554 StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) { 555 DEBUG_ONLY( _regs_in_thread = false; ) 556 if (phase == 0) { 557 generate_initial(); 558 } else if (phase == 1) { 559 generate_phase1(); // stubs that must be available for the interpreter 560 } else { 561 generate_all(); 562 } 563 } 564 }; 565 566 #endif // CPU_X86_STUBGENERATOR_X86_64_HPP