
src/hotspot/cpu/x86/stubGenerator_x86_64.cpp


  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "ci/ciUtilities.hpp"
  29 #include "compiler/oopMap.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/barrierSetNMethod.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "interpreter/interpreter.hpp"
  35 #include "memory/universe.hpp"
  36 #include "nativeInst_x86.hpp"
  37 #include "oops/instanceOop.hpp"
  38 #include "oops/method.hpp"
  39 #include "oops/objArrayKlass.hpp"
  40 #include "oops/oop.inline.hpp"
  41 #include "prims/methodHandles.hpp"
  42 #include "runtime/arguments.hpp"
  43 #include "runtime/frame.inline.hpp"
  44 #include "runtime/handles.inline.hpp"
  45 #include "runtime/sharedRuntime.hpp"
  46 #include "runtime/stubCodeGenerator.hpp"
  47 #include "runtime/stubRoutines.hpp"
  48 #include "runtime/thread.inline.hpp"
  49 #ifdef COMPILER2
  50 #include "opto/runtime.hpp"
  51 #endif
  52 #if INCLUDE_JVMCI
  53 #include "jvmci/jvmci_globals.hpp"
  54 #endif
  55 #if INCLUDE_ZGC
  56 #include "gc/z/zThreadLocalData.hpp"
  57 #endif
  58 
  59 // Declaration and definition of StubGenerator (no .hpp file).
  60 // For a more detailed description of the stub routine structure
  61 // see the comment in stubRoutines.hpp
  62 
  63 #define __ _masm->
  64 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  65 #define a__ ((Assembler*)_masm)->
  66 
  67 #ifdef PRODUCT
  68 #define BLOCK_COMMENT(str) /* nothing */
  69 #else
  70 #define BLOCK_COMMENT(str) __ block_comment(str)
  71 #endif
  72 
  73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  74 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
  75 
  76 // Stub Code definitions
  77 
  78 class StubGenerator: public StubCodeGenerator {
  79  private:
  80 
  81 #ifdef PRODUCT
  82 #define inc_counter_np(counter) ((void)0)
  83 #else
  84   void inc_counter_np_(int& counter) {
  85     // This can destroy rscratch1 if counter is far from the code cache
  86     __ incrementl(ExternalAddress((address)&counter));
  87   }
  88 #define inc_counter_np(counter) \
  89   BLOCK_COMMENT("inc_counter " #counter); \
  90   inc_counter_np_(counter);
  91 #endif
  92 
  93   // Call stubs are used to call Java from C
  94   //
  95   // Linux Arguments:

 366 #ifdef ASSERT
 367     // verify that threads correspond
 368     {
 369      Label L1, L2, L3;
 370       __ cmpptr(r15_thread, thread);
 371       __ jcc(Assembler::equal, L1);
 372       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 373       __ bind(L1);
 374       __ get_thread(rbx);
 375       __ cmpptr(r15_thread, thread);
 376       __ jcc(Assembler::equal, L2);
 377       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 378       __ bind(L2);
 379       __ cmpptr(r15_thread, rbx);
 380       __ jcc(Assembler::equal, L3);
 381       __ stop("StubRoutines::call_stub: threads must correspond");
 382       __ bind(L3);
 383     }
 384 #endif
 385 
 386     // restore regs belonging to calling function
 387 #ifdef _WIN64
 388     // emit the restores for xmm regs
 389     if (VM_Version::supports_evex()) {
 390       for (int i = xmm_save_first; i <= last_reg; i++) {
 391         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 392       }
 393     } else {
 394       for (int i = xmm_save_first; i <= last_reg; i++) {
 395         __ movdqu(as_XMMRegister(i), xmm_save(i));
 396       }
 397     }
 398 #endif
 399     __ movptr(r15, r15_save);
 400     __ movptr(r14, r14_save);
 401     __ movptr(r13, r13_save);
 402     __ movptr(r12, r12_save);
 403     __ movptr(rbx, rbx_save);
 404 
 405 #ifdef _WIN64

1868       __ movb(Address(end_to, 8), rax);
1869     }
1870   __ BIND(L_exit);
1871     address ucme_exit_pc = __ pc();
1872     restore_arg_regs();
1873     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1874     __ xorptr(rax, rax); // return 0
1875     __ vzeroupper();
1876     __ leave(); // required for proper stackwalking of RuntimeStub frame
1877     __ ret(0);
1878 
1879     {
1880       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
1881       // Copy in multi-bytes chunks
1882       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1883       __ jmp(L_copy_4_bytes);
1884     }
1885     return start;
1886   }
1887 
1888   // Arguments:
1889   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1890   //             ignored
1891   //   name    - stub name string
1892   //
1893   // Inputs:
1894   //   c_rarg0   - source array address
1895   //   c_rarg1   - destination array address
1896   //   c_rarg2   - element count, treated as ssize_t, can be zero
1897   //
1898   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1899   // we let the hardware handle it.  The one to eight bytes within words,
1900   // dwords or qwords that span cache line boundaries will still be loaded
1901   // and stored atomically.
1902   //
1903   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1904                                       address* entry, const char *name) {
1905 #if COMPILER2_OR_JVMCI
1906     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1907        return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,

3430 
3431     // We don't generate specialized code for HeapWord-aligned source
3432     // arrays, so just use the code we've already generated
3433     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3434     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3435 
3436     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3437     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3438 
3439     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3440     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3441 
3442     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3443     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3444 
3445     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3446     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3447 
3448     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3449     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3450   }
3451 
3452   // AES intrinsic stubs
3453   enum {AESBlockSize = 16};
3454 
3455   address generate_key_shuffle_mask() {
3456     __ align(16);
3457     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3458     address start = __ pc();
3459     __ emit_data64( 0x0405060700010203, relocInfo::none );
3460     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3461     return start;
3462   }
3463 
3464   address generate_counter_shuffle_mask() {
3465     __ align(16);
3466     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3467     address start = __ pc();
3468     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3469     __ emit_data64(0x0001020304050607, relocInfo::none);

7400     __ enter(); // required for proper stackwalking of RuntimeStub frame
7401 
7402 #ifdef _WIN64
7403     __ push(rsi);
7404     __ push(rdi);
7405 #endif
7406     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7407 
7408 #ifdef _WIN64
7409     __ pop(rdi);
7410     __ pop(rsi);
7411 #endif
7412 
7413     __ leave(); // required for proper stackwalking of RuntimeStub frame
7414     __ ret(0);
7415 
7416     return start;
7417 
7418   }
7419 
7420 #undef __
7421 #define __ masm->
7422 
7423   // Continuation point for throwing of implicit exceptions that are
7424   // not handled in the current activation. Fabricates an exception
7425   // oop and initiates normal exception dispatching in this
7426   // frame. Since we need to preserve callee-saved values (currently
7427   // only for C2, but done for C1 as well) we need a callee-saved oop
7428   // map and therefore have to make these stubs into RuntimeStubs
7429   // rather than BufferBlobs.  If the compiler needs all registers to
7430   // be preserved between the fault point and the exception handler
7431   // then it must assume responsibility for that in
7432   // AbstractCompiler::continuation_for_implicit_null_exception or
7433   // continuation_for_implicit_division_by_zero_exception. All other
7434   // implicit exceptions (e.g., NullPointerException or
7435   // AbstractMethodError on entry) are either at call sites or
7436   // otherwise assume that stack unwinding will be initiated, so
7437   // caller saved registers were assumed volatile in the compiler.
7438   address generate_throw_exception(const char* name,
7439                                    address runtime_entry,

7628       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7629         StubRoutines::_dsin = generate_libmSin();
7630       }
7631       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7632         StubRoutines::_dcos = generate_libmCos();
7633       }
7634       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7635         StubRoutines::_dtan = generate_libmTan();
7636       }
7637     }
7638 
7639     // Safefetch stubs.
7640     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7641                                                        &StubRoutines::_safefetch32_fault_pc,
7642                                                        &StubRoutines::_safefetch32_continuation_pc);
7643     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7644                                                        &StubRoutines::_safefetchN_fault_pc,
7645                                                        &StubRoutines::_safefetchN_continuation_pc);
7646   }
7647 
7648   void generate_all() {
7649     // Generates all stubs and initializes the entry points
7650 
7651     // These entry points require SharedInfo::stack0 to be set up in
7652     // non-core builds and need to be relocatable, so they each
7653     // fabricate a RuntimeStub internally.
7654     StubRoutines::_throw_AbstractMethodError_entry =
7655       generate_throw_exception("AbstractMethodError throw_exception",
7656                                CAST_FROM_FN_PTR(address,
7657                                                 SharedRuntime::
7658                                                 throw_AbstractMethodError));
7659 
7660     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7661       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7662                                CAST_FROM_FN_PTR(address,
7663                                                 SharedRuntime::
7664                                                 throw_IncompatibleClassChangeError));
7665 
7666     StubRoutines::_throw_NullPointerException_at_call_entry =
7667       generate_throw_exception("NullPointerException at call throw_exception",

7885         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libsvml, ebuf);
7886 
7887         snprintf(ebuf, sizeof(ebuf), "__svml_%s1_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7888         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsvml, ebuf);
7889 
7890         snprintf(ebuf, sizeof(ebuf), "__svml_%s2_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7891         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsvml, ebuf);
7892 
7893         snprintf(ebuf, sizeof(ebuf), "__svml_%s4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7894         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libsvml, ebuf);
7895       }
7896     }
7897 #endif // COMPILER2
7898 
7899     if (UseVectorizedMismatchIntrinsic) {
7900       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
7901     }
7902   }
7903 
7904  public:
7905   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7906     if (all) {
7907       generate_all();
7908     } else {
7909       generate_initial();
7910     }
7911   }
7912 }; // end class declaration
7913 
7914 #define UCM_TABLE_MAX_ENTRIES 16
7915 void StubGenerator_generate(CodeBuffer* code, bool all) {
7916   if (UnsafeCopyMemory::_table == NULL) {
7917     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7918   }
7919   StubGenerator g(code, all);
7920 }

  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "ci/ciUtilities.hpp"
  29 #include "compiler/oopMap.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/barrierSetNMethod.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "interpreter/interpreter.hpp"
  35 #include "memory/universe.hpp"
  36 #include "nativeInst_x86.hpp"
  37 #include "oops/instanceOop.hpp"
  38 #include "oops/method.hpp"
  39 #include "oops/objArrayKlass.hpp"
  40 #include "oops/oop.inline.hpp"
  41 #include "prims/jvmtiExport.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/arguments.hpp"
  44 #include "runtime/continuation.hpp"
  45 #include "runtime/frame.inline.hpp"
  46 #include "runtime/handles.inline.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubCodeGenerator.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "runtime/thread.inline.hpp"
  51 #ifdef COMPILER2
  52 #include "opto/runtime.hpp"
  53 #endif
  54 #if INCLUDE_JVMCI
  55 #include "jvmci/jvmci_globals.hpp"
  56 #endif
  57 #if INCLUDE_ZGC
  58 #include "gc/z/zThreadLocalData.hpp"
  59 #endif
  60 #if INCLUDE_JFR
  61 #include "jfr/support/jfrIntrinsics.hpp"
  62 #endif
  63 
  64 // Declaration and definition of StubGenerator (no .hpp file).
  65 // For a more detailed description of the stub routine structure
  66 // see the comment in stubRoutines.hpp
  67 
  68 #define __ _masm->
  69 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  70 #define a__ ((Assembler*)_masm)->
  71 
  72 #ifdef PRODUCT
  73 #define BLOCK_COMMENT(str) /* nothing */
  74 #else
  75 #define BLOCK_COMMENT(str) __ block_comment(str)
  76 #endif
  77 
  78 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  79 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
  80 
  81 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots);
  82 void fill_continuation_entry(MacroAssembler* masm);
  83 void continuation_enter_cleanup(MacroAssembler* masm);
  84 
  85 // Stub Code definitions
  86 
  87 class StubGenerator: public StubCodeGenerator {
  88  private:
  89 
  90 #ifdef PRODUCT
  91 #define inc_counter_np(counter) ((void)0)
  92 #else
  93   void inc_counter_np_(int& counter) {
  94     // This can destroy rscratch1 if counter is far from the code cache
  95     __ incrementl(ExternalAddress((address)&counter));
  96   }
  97 #define inc_counter_np(counter) \
  98   BLOCK_COMMENT("inc_counter " #counter); \
  99   inc_counter_np_(counter);
 100 #endif
 101 
 102   // Call stubs are used to call Java from C
 103   //
 104   // Linux Arguments:

 375 #ifdef ASSERT
 376     // verify that threads correspond
 377     {
 378      Label L1, L2, L3;
 379       __ cmpptr(r15_thread, thread);
 380       __ jcc(Assembler::equal, L1);
 381       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 382       __ bind(L1);
 383       __ get_thread(rbx);
 384       __ cmpptr(r15_thread, thread);
 385       __ jcc(Assembler::equal, L2);
 386       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 387       __ bind(L2);
 388       __ cmpptr(r15_thread, rbx);
 389       __ jcc(Assembler::equal, L3);
 390       __ stop("StubRoutines::call_stub: threads must correspond");
 391       __ bind(L3);
 392     }
 393 #endif
 394 
 395     __ pop_cont_fastpath(r15_thread);
 396 
 397     // restore regs belonging to calling function
 398 #ifdef _WIN64
 399     // emit the restores for xmm regs
 400     if (VM_Version::supports_evex()) {
 401       for (int i = xmm_save_first; i <= last_reg; i++) {
 402         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 403       }
 404     } else {
 405       for (int i = xmm_save_first; i <= last_reg; i++) {
 406         __ movdqu(as_XMMRegister(i), xmm_save(i));
 407       }
 408     }
 409 #endif
 410     __ movptr(r15, r15_save);
 411     __ movptr(r14, r14_save);
 412     __ movptr(r13, r13_save);
 413     __ movptr(r12, r12_save);
 414     __ movptr(rbx, rbx_save);
 415 
 416 #ifdef _WIN64

1879       __ movb(Address(end_to, 8), rax);
1880     }
1881   __ BIND(L_exit);
1882     address ucme_exit_pc = __ pc();
1883     restore_arg_regs();
1884     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1885     __ xorptr(rax, rax); // return 0
1886     __ vzeroupper();
1887     __ leave(); // required for proper stackwalking of RuntimeStub frame
1888     __ ret(0);
1889 
1890     {
1891       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
1892       // Copy in multi-bytes chunks
1893       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1894       __ jmp(L_copy_4_bytes);
1895     }
1896     return start;
1897   }
1898 
1899   // Fast memory copying for continuations
1900   // See:
1901   // - Intel 64 and IA-32 Architectures Optimization Reference Manual: (https://software.intel.com/sites/default/files/managed/9e/bc/64-ia-32-architectures-optimization-manual.pdf)
1902   //   - 2.7.6 REP String Enhancement
1903   //   - 3.7.5 REP Prefix and Data Movement
1904   //   - 3.7.6 Enhanced REP MOVSB and STOSB Operation
1905   //   - 8.1 GENERAL PREFETCH CODING GUIDELINES
1906   //   - 8.4.1.2 Streaming Non-temporal Stores, 8.4.1.3 Memory Type and Non-temporal Stores
1907   //   - 8.5 MEMORY OPTIMIZATION USING PREFETCH, 8.5.6 Software Prefetch Scheduling Distance, 8.5.7 Software Prefetch Concatenation
1908   //   - 14.3, MIXING AVX CODE WITH SSE CODE + https://software.intel.com/en-us/articles/intel-avx-state-transitions-migrating-sse-code-to-avx
1909   // - Optimizing subroutines in assembly language, 17.9 Moving blocks of data https://www.agner.org/optimize/optimizing_assembly.pdf
1910   // - StackOverflow
1911   //   - https://stackoverflow.com/q/26246040/750563 What's missing/sub-optimal in this memcpy implementation?
1912   //   - https://stackoverflow.com/q/43343231/750563 Enhanced REP MOVSB for memcpy
1913   //   - https://stackoverflow.com/q/33902068/750563 What setup does REP do?
1914   //   - https://stackoverflow.com/q/8858778/750563  Why are complicated memcpy/memset superior?
1915   //   - https://stackoverflow.com/q/1715224/750563  Very fast memcpy for image processing?
1916   //   - https://stackoverflow.com/q/17312823/750563 When program will benefit from prefetch & non-temporal load/store?
1917   //   - https://stackoverflow.com/q/40096894/750563 Do current x86 architectures support non-temporal loads (from “normal” memory)?
1918   //   - https://stackoverflow.com/q/32103968/750563 Non-temporal loads and the hardware prefetcher, do they work together?
1919   // - https://docs.roguewave.com/threadspotter/2011.2/manual_html_linux/manual_html/ch05s03.html Non-Temporal Data
1920   // - https://blogs.fau.de/hager/archives/2103 A case for the non-temporal store
1921   // - https://vgatherps.github.io/2018-09-02-nontemporal/ Optimizing Cache Usage With Nontemporal Accesses
1922   // - https://www.reddit.com/r/cpp/comments/9ccb88/optimizing_cache_usage_with_nontemporal_accesses/
1923   // - https://lwn.net/Articles/255364/ Memory part 5: What programmers can do
1924   // - https://software.intel.com/en-us/forums/intel-isa-extensions/topic/597075 Do Non-Temporal Loads Prefetch?
1925   // - https://software.intel.com/en-us/forums/intel-fortran-compiler/topic/275765#comment-1551057 Time to revisit REP;MOVS
1926   // - https://lemire.me/blog/2018/09/07/avx-512-when-and-how-to-use-these-new-instructions/ AVX-512: when and how to use these new instructions (explains AVX3Threshold)
1927   // - https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html Gathering Intel on Intel AVX-512 Transitions
1928 
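The references above motivate the non-temporal (streaming) stores and software prefetches used by the copy stubs that follow. As a rough standalone illustration of the idea (not taken from this patch; the helper name copy_words_nt and its alignment assumptions are hypothetical), the same pattern in C++ with SSE2 intrinsics:

#include <emmintrin.h>   // SSE2: _mm_loadu_si128, _mm_stream_si128, _mm_sfence
#include <stddef.h>
#include <stdint.h>

// Copy 'qwords' 8-byte words using streaming (non-temporal) stores, which
// bypass the cache so a large one-shot copy does not evict useful data.
// Assumes 'dst' is 16-byte aligned and 'qwords' is even, mirroring the
// alignment the stubs establish before entering their main loops.
static void copy_words_nt(const uint64_t* src, uint64_t* dst, size_t qwords) {
  for (size_t i = 0; i + 2 <= qwords; i += 2) {
    __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i)); // ordinary load
    _mm_stream_si128(reinterpret_cast<__m128i*>(dst + i), v);               // non-temporal store
  }
  _mm_sfence(); // order the streaming stores before any later access to 'dst'
}

The movdqa/vmovdqa/evmovdqa store variants taking an 'nt' flag in the stubs below presumably select the corresponding non-temporal encodings when nt is true.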
1929 
1930   // Used by continuations to copy from stack
1931   // Arguments:
1932   //   name - stub name string
1933   //   nt   -  use non-temporal stores
1934   //
1935   // Inputs:
1936   //   c_rarg0   - source array address       -- 16-byte aligned
1937   //   c_rarg1   - destination array address  --  8-byte aligned
1938   //   c_rarg2   - element count, in qwords (8 bytes), >= 2
1939   //
1940   address generate_disjoint_word_copy_up(bool nt, const char *name) {
1941     const bool align = nt;
1942 
1943     __ align(CodeEntryAlignment);
1944     StubCodeMark mark(this, "StubRoutines", name);
1945     address start = __ pc();
1946 
1947     Label L_copy_bytes, L_copy_8_bytes, L_loop, L_end, L_exit;
1948     const Register from        = rdi;  // source array address
1949     const Register to          = rsi;  // destination array address
1950     const Register count       = rdx;  // elements count
1951     const Register qword_count = count;
1952     const Register end_from    = from; // source array end address
1953     const Register end_to      = to;   // destination array end address
1954     const Register alignment   = rcx;
1955 
1956     // End pointers are inclusive, and if count is not zero they point
1957     // to the last unit copied:  end_to[0] := end_from[0]
1958 
1959     __ enter(); // required for proper stackwalking of RuntimeStub frame
1960     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1961 
1962     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1963                       // r9 and r10 may be used to save non-volatile registers
1964 
1965     // Copy from low to high addresses.
1966     // By pointing to the end and negating qword_count we:
1967     // 1. only update count, not from/to; 2. don't need another register to hold total count; 3. can jcc right after addptr without cmpptr
1968 
1969     // __ movptr(alignment, to);
1970     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1971     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1972     __ negptr(qword_count); // make the count negative
1973     // Address(end_from/to, qword_count, Address::times_8) now points 8 bytes *below* the original from/to
1974     // i.e. orig to == Address(end_to, qword_count, Address::times_8, 8)
1975 
1976     // Copy in multi-bytes chunks
1977 
1978     if (UseUnalignedLoadStores) {
1979       if (align) { // align target
1980         NearLabel L_aligned_128, L_aligned_256, L_aligned_512;
1981 
1982         __ lea(alignment, Address(end_to, qword_count, Address::times_8, 8)); // == original to
1983         __ negptr(alignment); // we align by copying from the beginning of to, making it effectively larger
1984 
1985         __ testl(alignment, 8);
1986         __ jccb(Assembler::zero, L_aligned_128);
1987         __ increment(qword_count);
1988         // no need to test because we know qword_count >= 2
1989         __ movq(rax, Address(end_from, qword_count, Address::times_8, -0));
1990         __ movqa(Address(end_to, qword_count, Address::times_8, -0), rax, nt);
1991         __ bind(L_aligned_128);
1992 
1993         if (UseAVX >= 2) {
1994           __ testl(alignment, 16);
1995           __ jccb(Assembler::zero, L_aligned_256);
1996           __ cmpptr(qword_count, -2);
1997           if (UseAVX > 2) {
1998             __ jcc(Assembler::greater, L_copy_8_bytes);
1999           } else {
2000             __ jccb(Assembler::greater, L_copy_8_bytes);
2001           }
2002           __ addptr(qword_count, 2);
2003           __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -8));
2004           __ movdqa(Address(end_to, qword_count, Address::times_8, -8), xmm0, nt);
2005           __ bind(L_aligned_256);
2006           // we can move from SSE to AVX without penalty, but not the other way around
2007         }
2008 
2009         if (UseAVX > 2) {
2010           __ testl(alignment, 32);
2011           __ jccb(Assembler::zero, L_aligned_512);
2012           __ addptr(qword_count, 4);
2013           __ jccb(Assembler::less, L_end);
2014           __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
2015           __ vmovdqa(Address(end_to, qword_count, Address::times_8, -24), xmm0, nt);
2016           __ bind(L_aligned_512);
2017         }
2018       }
2019 
2020       // Copy 64-bytes per iteration
2021       if (UseAVX > 2) {
2022         Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
2023 
2024         __ BIND(L_copy_bytes);
2025         __ cmpptr(qword_count, (-1 * AVX3Threshold / 8));
2026         __ jccb(Assembler::less, L_above_threshold);
2027         __ jmpb(L_below_threshold);
2028 
2029         __ align(OptoLoopAlignment);
2030         __ bind(L_loop_avx512);
2031         __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
2032         __ evmovdqa(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit, nt);
2033         __ bind(L_above_threshold);
2034         __ addptr(qword_count, 8);
2035         __ jcc(Assembler::lessEqual, L_loop_avx512);
2036         __ jmpb(L_32_byte_head);
2037 
2038         __ bind(L_loop_avx2);
2039         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
2040         __ vmovdqa(Address(end_to, qword_count, Address::times_8, -56), xmm0, nt);
2041         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
2042         __ vmovdqa(Address(end_to, qword_count, Address::times_8, -24), xmm1, nt);
2043         __ bind(L_below_threshold);
2044         __ addptr(qword_count, 8);
2045         __ jcc(Assembler::lessEqual, L_loop_avx2);
2046 
2047         __ bind(L_32_byte_head);
2048         __ subptr(qword_count, 4);  // sub(8) and add(4)
2049         __ jccb(Assembler::greater, L_end);
2050       } else {
2051         __ jmp(L_copy_bytes);
2052         __ align(OptoLoopAlignment);
2053         __ BIND(L_loop);
2054         if (UseAVX == 2) {
2055           __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
2056           __ vmovdqa(Address(end_to, qword_count, Address::times_8, -56), xmm0, nt);
2057           __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
2058           __ vmovdqa(Address(end_to, qword_count, Address::times_8, -24), xmm1, nt);
2059         } else {
2060           __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
2061           __ movdqa(Address(end_to, qword_count, Address::times_8, -56), xmm0, nt);
2062           __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
2063           __ movdqa(Address(end_to, qword_count, Address::times_8, -40), xmm1, nt);
2064           __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
2065           __ movdqa(Address(end_to, qword_count, Address::times_8, -24), xmm2, nt);
2066           __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
2067           __ movdqa(Address(end_to, qword_count, Address::times_8, - 8), xmm3, nt);
2068         }
2069 
2070         __ BIND(L_copy_bytes);
2071         __ addptr(qword_count, 8);
2072         __ jcc(Assembler::lessEqual, L_loop);
2073         __ subptr(qword_count, 4);  // sub(8) and add(4); we added the extra 8 at the end of the loop; we'll subtract the extra 4 right before "copy trailing qwords"
2074         __ jccb(Assembler::greater, L_end);
2075       }
2076       // Copy trailing 32 bytes
2077       if (UseAVX >= 2) {
2078         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
2079         __ vmovdqa(Address(end_to, qword_count, Address::times_8, -24), xmm0, nt);
2080       } else {
2081         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
2082         __ movdqa(Address(end_to, qword_count, Address::times_8, -24), xmm0, nt);
2083         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
2084         __ movdqa(Address(end_to, qword_count, Address::times_8, - 8), xmm1, nt);
2085       }
2086       __ addptr(qword_count, 4);
2087     } else {
2088       // Copy 32-bytes per iteration
2089       __ jmp(L_copy_bytes);
2090       __ align(OptoLoopAlignment);
2091       __ BIND(L_loop);
2092       __ movq(rax, Address(end_from, qword_count, Address::times_8, -24));
2093       __ movqa(Address(end_to, qword_count, Address::times_8, -24), rax, nt);
2094       __ movq(rax, Address(end_from, qword_count, Address::times_8, -16));
2095       __ movqa(Address(end_to, qword_count, Address::times_8, -16), rax, nt);
2096       __ movq(rax, Address(end_from, qword_count, Address::times_8, - 8));
2097       __ movqa(Address(end_to, qword_count, Address::times_8, - 8), rax, nt);
2098       __ movq(rax, Address(end_from, qword_count, Address::times_8, - 0));
2099       __ movqa(Address(end_to, qword_count, Address::times_8, - 0), rax, nt);
2100 
2101       __ BIND(L_copy_bytes);
2102       __ addptr(qword_count, 4);
2103       __ jcc(Assembler::lessEqual, L_loop);
2104     }
2105     __ BIND(L_end);
2106     __ subptr(qword_count, 4);
2107     __ jccb(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
2108 
2109     __ BIND(L_exit);
2110     restore_arg_regs();
2111     __ xorptr(rax, rax); // return 0
2112     __ vzeroupper();
2113     __ leave(); // required for proper stackwalking of RuntimeStub frame
2114     __ ret(0);
2115 
2116     // Copy trailing qwords
2117     __ BIND(L_copy_8_bytes);
2118     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2119     __ movqa(Address(end_to, qword_count, Address::times_8, 8), rax, nt);
2120     __ increment(qword_count);
2121     __ jcc(Assembler::notZero, L_copy_8_bytes);
2122     __ jmp(L_exit);
2123 
2124     return start;
2125   }
2126 
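The comment at the top of generate_disjoint_word_copy_up describes the negative-index idiom: point end_from/end_to at the last element, negate the count, and let a single addptr both advance the index and set the flags consumed by the following jcc. A minimal scalar sketch of the same idiom (illustrative only; copy_words_up is a hypothetical helper, not part of the stub):

#include <stddef.h>
#include <stdint.h>

// Index i runs from -(count-1) up to 0 over inclusive end pointers, so the
// loop needs only one increment whose sign/zero result drives the branch;
// there is no separate compare and no extra register for the total count.
// Assumes count >= 1 (the stub itself assumes count >= 2).
static void copy_words_up(const uint64_t* from, uint64_t* to, ptrdiff_t count) {
  const uint64_t* end_from = from + count - 1;  // inclusive end pointer
  uint64_t*       end_to   = to   + count - 1;  // inclusive end pointer
  for (ptrdiff_t i = -(count - 1); i <= 0; i++) {
    end_to[i] = end_from[i];
  }
}

In the stub the same effect is obtained with lea to form the end pointers, negptr on qword_count, and addptr/jcc pairs in the copy loops.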
2127   // Used by continuations to copy to stack
2128   // Arguments:
2129   //   name    - stub name string
2130   //   nt_mode - 0 - none, 1 - use non-temporal prefetches, 2 - use non-temporal loads
2131   //
2132   // Inputs:
2133   //   c_rarg0   - source array address      --  8-byte aligned
2134   //   c_rarg1   - destination array address -- 16-byte aligned
2135   //   c_rarg2   - element count, in qwords (8 bytes), >= 2
2136   //
2137   address generate_disjoint_word_copy_down(int nt_mode, const char *name) {
2138     const bool prefetchnt = (nt_mode == 1);
2139     const bool nt         = (nt_mode == 2);
2140     const bool align      = nt;
2141 
2142     __ align(CodeEntryAlignment);
2143     StubCodeMark mark(this, "StubRoutines", name);
2144     address start = __ pc();
2145 
2146     Label L_copy_bytes, L_copy_8_bytes, L_loop, L_end, L_exit;
2147     const Register from        = rdi;  // source array address
2148     const Register to          = rsi;  // destination array address
2149     const Register count       = rdx;  // elements count
2150     const Register qword_count = count;
2151     const Register alignment   = rcx; // rbx causes trouble
2152 
2153     __ enter(); // required for proper stackwalking of RuntimeStub frame
2154     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2155 
2156     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2157                       // r9 and r10 may be used to save non-volatile registers
2158 
2159     // Copy from high to low addresses.
2160 
2161     // Copy in multi-bytes chunks
2162 
2163     if (UseUnalignedLoadStores) {
2164       if (align) { // align source (only useful for nt)
2165         NearLabel L_aligned_128, L_aligned_256, L_aligned_512;
2166 
2167         __ lea(alignment, Address(from, qword_count, Address::times_8, 0)); // == end of source (from + count*8)
2168 
2169         __ testl(alignment, 8);
2170         __ jccb(Assembler::zero, L_aligned_128);
2171         __ decrement(qword_count);
2172         // no need to test because we know qword_count >= 2
2173         __ movdqa(xmm0, Address(from, qword_count, Address::times_8, 0), nt); // no 8-byte nt load
2174         __ psrldq(xmm0, 8); // movlhps(xmm0, xmm0);
2175         __ movdq(rax, xmm0);
2176         // __ movq(rax, Address(from, qword_count, Address::times_8, 0));
2177         __ movq(Address(to, qword_count, Address::times_8, 0), rax);
2178         __ bind(L_aligned_128);
2179 
2180         if (UseAVX >= 2) {
2181           __ testl(alignment, 16);
2182           __ jccb(Assembler::zero, L_aligned_256);
2183           __ cmpptr(qword_count, 2);
2184           if (UseAVX > 2) {
2185             __ jcc(Assembler::less, L_copy_8_bytes);
2186           } else {
2187             __ jccb(Assembler::less, L_copy_8_bytes);
2188           }
2189           __ subptr(qword_count, 2);
2190           __ movdqa(xmm0, Address(from, qword_count, Address::times_8, 0), nt);
2191           __ movdqu(Address(to, qword_count, Address::times_8, 0), xmm0);
2192           __ bind(L_aligned_256);
2193           // we can move from SSE to AVX without penalty, but not the other way around
2194         }
2195 
2196         if (UseAVX > 2) {
2197           __ testl(alignment, 32);
2198           __ jccb(Assembler::zero, L_aligned_512);
2199           __ subptr(qword_count, 4);
2200           __ jccb(Assembler::less, L_end);
2201           __ vmovdqa(xmm0, Address(from, qword_count, Address::times_8, 0), nt);
2202           __ vmovdqu(Address(to, qword_count, Address::times_8, 0), xmm0);
2203           __ bind(L_aligned_512);
2204         }
2205       }
2206 
2207       // Copy 64-bytes per iteration
2208       const int prefetch_distance = 2 * 64; // prefetch distance of 2
2209       if (UseAVX > 2) {
2210         Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
2211 
2212         __ BIND(L_copy_bytes);
2213         __ cmpptr(qword_count, (AVX3Threshold / 8));
2214         __ jccb(Assembler::greater, L_above_threshold);
2215         __ jmpb(L_below_threshold);
2216 
2217         __ align(OptoLoopAlignment);
2218         __ BIND(L_loop_avx512);
2219         if (prefetchnt) {
2220           __ prefetchnta(Address(from, qword_count, Address::times_8, -prefetch_distance));
2221         }
2222         __ evmovdqa(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit, nt);
2223         __ evmovdqul(Address(to, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
2224         __ bind(L_above_threshold);
2225         __ subptr(qword_count, 8);
2226         __ jcc(Assembler::greaterEqual, L_loop_avx512);
2227         __ jmpb(L_32_byte_head);
2228 
2229         __ bind(L_loop_avx2);
2230         if (prefetchnt) {
2231           __ prefetchnta(Address(from, qword_count, Address::times_8, -prefetch_distance));
2232         }
2233         __ vmovdqa(xmm0, Address(from, qword_count, Address::times_8, 32), nt);
2234         __ vmovdqu(Address(to, qword_count, Address::times_8, 32), xmm0);
2235         __ vmovdqa(xmm1, Address(from, qword_count, Address::times_8, 0), nt);
2236         __ vmovdqu(Address(to, qword_count, Address::times_8, 0), xmm1);
2237         __ bind(L_below_threshold);
2238         __ subptr(qword_count, 8);
2239         __ jcc(Assembler::greaterEqual, L_loop_avx2);
2240 
2241         __ bind(L_32_byte_head);
2242         __ addptr(qword_count, 4);  // add(8) and sub(4)
2243         __ jccb(Assembler::less, L_end);
2244       } else {
2245         __ jmp(L_copy_bytes);
2246         __ align(OptoLoopAlignment);
2247         __ BIND(L_loop);
2248         if (prefetchnt) {
2249           __ prefetchnta(Address(from, qword_count, Address::times_8, -prefetch_distance));
2250         }
2251         if (UseAVX == 2) {
2252           __ vmovdqa(xmm0, Address(from, qword_count, Address::times_8, 32), nt);
2253           __ vmovdqu(Address(to, qword_count, Address::times_8, 32), xmm0);
2254           __ vmovdqa(xmm1, Address(from, qword_count, Address::times_8,  0), nt);
2255           __ vmovdqu(Address(to, qword_count, Address::times_8,  0), xmm1);
2256         } else {
2257           __ movdqa(xmm0, Address(from, qword_count, Address::times_8, 48), nt);
2258           __ movdqu(Address(to, qword_count, Address::times_8, 48), xmm0);
2259           __ movdqa(xmm1, Address(from, qword_count, Address::times_8, 32), nt);
2260           __ movdqu(Address(to, qword_count, Address::times_8, 32), xmm1);
2261           __ movdqa(xmm2, Address(from, qword_count, Address::times_8, 16), nt);
2262           __ movdqu(Address(to, qword_count, Address::times_8, 16), xmm2);
2263           __ movdqa(xmm3, Address(from, qword_count, Address::times_8,  0), nt);
2264           __ movdqu(Address(to, qword_count, Address::times_8,  0), xmm3);
2265         }
2266 
2267         __ BIND(L_copy_bytes);
2268         __ subptr(qword_count, 8);
2269         __ jcc(Assembler::greaterEqual, L_loop);
2270 
2271         __ addptr(qword_count, 4);  // add(8) and sub(4)
2272         __ jccb(Assembler::less, L_end);
2273       }
2274       // Copy trailing 32 bytes
2275       if (UseAVX >= 2) {
2276         __ vmovdqa(xmm0, Address(from, qword_count, Address::times_8, 0), nt);
2277         __ vmovdqu(Address(to, qword_count, Address::times_8, 0), xmm0);
2278       } else {
2279         __ movdqa(xmm0, Address(from, qword_count, Address::times_8, 16), nt);
2280         __ movdqu(Address(to, qword_count, Address::times_8, 16), xmm0);
2281         __ movdqa(xmm1, Address(from, qword_count, Address::times_8,  0), nt);
2282         __ movdqu(Address(to, qword_count, Address::times_8,  0), xmm1);
2283       }
2284       __ subptr(qword_count, 4);
2285     } else {
2286       // Copy 32-bytes per iteration
2287       const int prefetch_distance = 4 * 32; // prefetch distance of 4
2288       __ jmp(L_copy_bytes);
2289       __ align(OptoLoopAlignment);
2290       __ BIND(L_loop);
2291       if (prefetchnt) {
2292         __ prefetchnta(Address(from, qword_count, Address::times_8, -prefetch_distance));
2293       }
2294       __ movq(rax, Address(from, qword_count, Address::times_8, 24));
2295       __ movq(Address(to, qword_count, Address::times_8, 24), rax);
2296       __ movq(rax, Address(from, qword_count, Address::times_8, 16));
2297       __ movq(Address(to, qword_count, Address::times_8, 16), rax);
2298       __ movq(rax, Address(from, qword_count, Address::times_8,  8));
2299       __ movq(Address(to, qword_count, Address::times_8,  8), rax);
2300       __ movq(rax, Address(from, qword_count, Address::times_8,  0));
2301       __ movq(Address(to, qword_count, Address::times_8,  0), rax);
2302 
2303       __ BIND(L_copy_bytes);
2304       __ subptr(qword_count, 4);
2305       __ jcc(Assembler::greaterEqual, L_loop);
2306     }
2307     __ BIND(L_end);
2308     __ addptr(qword_count, 4);
2309     __ jccb(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
2310 
2311     __ BIND(L_exit);
2312     restore_arg_regs();
2313     __ xorptr(rax, rax); // return 0
2314     __ vzeroupper();
2315     __ leave(); // required for proper stackwalking of RuntimeStub frame
2316     __ ret(0);
2317 
2318     // Copy trailing qwords
2319     __ BIND(L_copy_8_bytes);
2320     if (nt) {
2321       __ prefetchnta(Address(from, qword_count, Address::times_8, -8));
2322     }
2323     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2324     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2325     __ decrement(qword_count);
2326     __ jcc(Assembler::notZero, L_copy_8_bytes);
2327     __ jmp(L_exit);
2328 
2329     return start;
2330   }
2331 
2332   // Arguments:
2333   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2334   //             ignored
2335   //   name    - stub name string
2336   //
2337   // Inputs:
2338   //   c_rarg0   - source array address
2339   //   c_rarg1   - destination array address
2340   //   c_rarg2   - element count, treated as ssize_t, can be zero
2341   //
2342   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
2343   // we let the hardware handle it.  The one to eight bytes within words,
2344   // dwords or qwords that span cache line boundaries will still be loaded
2345   // and stored atomically.
2346   //
2347   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
2348                                       address* entry, const char *name) {
2349 #if COMPILER2_OR_JVMCI
2350     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2351        return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,

3874 
3875     // We don't generate specialized code for HeapWord-aligned source
3876     // arrays, so just use the code we've already generated
3877     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3878     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3879 
3880     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3881     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3882 
3883     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3884     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3885 
3886     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3887     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3888 
3889     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3890     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3891 
3892     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3893     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3894 
3895     StubRoutines::_has_word_memcpy     = true;
3896     StubRoutines::_word_memcpy_up      = generate_disjoint_word_copy_up  (false, "word_memcpy_up");
3897     StubRoutines::_word_memcpy_up_nt   = generate_disjoint_word_copy_up  (true,  "word_memcpy_up_nt");
3898     StubRoutines::_word_memcpy_down    = generate_disjoint_word_copy_down(0,     "word_memcpy_down");
3899     StubRoutines::_word_memcpy_down_nt = generate_disjoint_word_copy_down(1,     "word_memcpy_down_nt");
3900   }
3901 
3902   // AES intrinsic stubs
3903   enum {AESBlockSize = 16};
3904 
3905   address generate_key_shuffle_mask() {
3906     __ align(16);
3907     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3908     address start = __ pc();
3909     __ emit_data64( 0x0405060700010203, relocInfo::none );
3910     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3911     return start;
3912   }
3913 
3914   address generate_counter_shuffle_mask() {
3915     __ align(16);
3916     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3917     address start = __ pc();
3918     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3919     __ emit_data64(0x0001020304050607, relocInfo::none);

7850     __ enter(); // required for proper stackwalking of RuntimeStub frame
7851 
7852 #ifdef _WIN64
7853     __ push(rsi);
7854     __ push(rdi);
7855 #endif
7856     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7857 
7858 #ifdef _WIN64
7859     __ pop(rdi);
7860     __ pop(rsi);
7861 #endif
7862 
7863     __ leave(); // required for proper stackwalking of RuntimeStub frame
7864     __ ret(0);
7865 
7866     return start;
7867 
7868   }
7869 
7870   RuntimeStub* generate_cont_doYield() {
7871     const char *name = "cont_doYield";
7872 
7873     enum layout {
7874       rbp_off,
7875       rbpH_off,
7876       return_off,
7877       return_off2,
7878       framesize // inclusive of return address
7879     };
7880     // assert(is_even(framesize/2), "sp not 16-byte aligned");
7881     
7882     int insts_size = 512;
7883     int locs_size  = 64;
7884     CodeBuffer code(name, insts_size, locs_size);
7885     OopMapSet* oop_maps  = new OopMapSet();
7886     MacroAssembler* masm = new MacroAssembler(&code);
7887     MacroAssembler* _masm = masm;
7888 
7889     address start = __ pc();
7890 
7891     __ enter();
7892 
7893     __ movptr(c_rarg1, rsp);
7894 
7895     int frame_complete = __ pc() - start;
7896     address the_pc = __ pc();
7897 
7898     __ post_call_nop(); // this must come exactly after the pc value that is pushed into the frame info; we use this nop for fast CodeBlob lookup
7899 
7900     if (ContPerfTest > 5) {
7901       __ movptr(c_rarg0, r15_thread);
7902       __ set_last_Java_frame(rsp, rbp, the_pc);
7903 
7904       __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::freeze), 2);
7905       
7906       __ reset_last_Java_frame(true);
7907     }
7908 
7909     Label pinned;
7910 
7911     if (ContPerfTest <= 5) { __ xorq(rax, rax); }
7912     __ testq(rax, rax);
7913     __ jcc(Assembler::notZero, pinned);
7914 
7915     __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
7916     continuation_enter_cleanup(masm);
7917     __ pop(rbp);
7918     __ ret(0);
7919 
7920     __ bind(pinned); // pinned -- return to caller
7921 
7922     __ leave();
7923     __ ret(0);
7924 
7925     OopMap* map = new OopMap(framesize, 1);
7926     // map->set_callee_saved(VMRegImpl::stack2reg(rbp_off), rbp->as_VMReg());
7927     oop_maps->add_gc_map(the_pc - start, map);
7928 
7929     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7930     RuntimeStub::new_runtime_stub(name,
7931                                   &code,
7932                                   frame_complete,
7933                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7934                                   oop_maps, false);
7935     return stub;
7936   }
7937 
7938   address generate_cont_jump_from_safepoint() {
7939     StubCodeMark mark(this, "StubRoutines","Continuation jump from safepoint");
7940 
7941     address start = __ pc();
7942 
7943     __ get_thread(r15_thread);
7944     __ reset_last_Java_frame(true); // false would be fine, too, I guess
7945     __ reinit_heapbase();
7946     
7947     __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
7948     continuation_enter_cleanup(_masm);
7949     __ pop(rbp);
7950     __ ret(0);
7951 
7952     return start;
7953   }
7954 
7955   address generate_cont_thaw(bool return_barrier, bool exception) {
7956     assert (return_barrier || !exception, "must be");
7957 
7958     address start = __ pc();
7959 
7960     // TODO: Handle Valhalla return types. May require generating different return barriers.
7961 
7962     if (!return_barrier) {
7963       __ pop(c_rarg3); // pop return address. if we don't do this, we get a drift, where the bottom-most frozen frame continuously grows
7964     } else {
7965       __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
7966     }
7967     assert_asm(_masm, cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())), Assembler::equal, "incorrect rsp");
7968 
7969     if (return_barrier) {
7970       __ push(rax); __ push_d(xmm0); // preserve possible return value from a method returning to the return barrier
7971     }
7972 
7973     __ movl(c_rarg1, (return_barrier ? 1 : 0) + (exception ? 1 : 0));
7974     if (ContPerfTest > 105) {
7975       __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), r15_thread, c_rarg1);
7976       __ movptr(rbx, rax); // rax contains the size of the frames to thaw, 0 if overflow or no more frames
7977     } else {
7978       __ xorq(rbx, rbx);
7979     }
7980     if (return_barrier) {
7981       __ pop_d(xmm0); __ pop(rax); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7982     }
7983     assert_asm(_masm, cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())), Assembler::equal, "incorrect rsp");
7984   // #ifdef ASSERT
7985   //   __ lea(rcx, Address(rsp, wordSize));
7986   //   assert_asm(_masm, cmpptr(rcx, Address(r15_thread, JavaThread::cont_entry_offset())), Assembler::equal, "incorrect rsp");
7987   // #endif
7988 
7989     Label thaw_success;
7990     __ testq(rbx, rbx);           // rbx contains the size of the frames to thaw, 0 if overflow or no more frames
7991     __ jcc(Assembler::notZero, thaw_success);
7992     __ jump(ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
7993     __ bind(thaw_success);
7994 
7995     __ subq(rsp, rbx);             // make room for the thawed frames
7996     __ andptr(rsp, -16);           // align
7997     
7998     if (return_barrier) {
7999       __ push(rax); __ push_d(xmm0); // save original return value -- again
8000     }
8001 
8002     __ movl(c_rarg1, (return_barrier ? 1 : 0) + (exception ? 1 : 0));
8003     if (ContPerfTest > 112) {
8004       __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::thaw), r15_thread, c_rarg1);
8005     }
8006     __ movptr(rbx, rax); // rax is the sp of the yielding frame
8007 
8008     if (return_barrier) {
8009       __ pop_d(xmm0); __ pop(rax); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
8010     } else {
8011       __ movl(rax, 0); // return 0 (success) from doYield
8012     }
8013 
8014     __ movptr(rsp, rbx); // we're now on the yield frame (which is at an address above us b/c rsp has been pushed down)
8015     __ subptr(rsp, 2*wordSize); // now pointing to rbp spill
8016 
8017     if (exception) {
8018       __ movptr(c_rarg1, Address(rsp, wordSize)); // return address
8019       __ push(rax); // save return value containing the exception oop
8020       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), r15_thread, c_rarg1);
8021       __ movptr(rbx, rax); // the exception handler
8022       __ pop(rax); // restore return value containing the exception oop
8023       __ pop(rbp);
8024       __ pop(rdx); // rdx must contain the original pc in the case of exception; see OptoRuntime::generate_exception_blob
8025       __ jmp(rbx); // the exception handler
8026     }
8027 
8028     // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
8029     __ pop(rbp);
8030     __ ret(0);
8031 
8032     return start;
8033   }
8034 
8035   address generate_cont_thaw() {
8036     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
8037     address start = __ pc();
8038     generate_cont_thaw(false, false);
8039     return start;
8040   }
8041 
8042   address generate_cont_returnBarrier() {
8043     // TODO: will probably need multiple return barriers depending on return type
8044     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
8045     address start = __ pc();
8046 
8047     generate_cont_thaw(true, false);
8048 
8049     return start;
8050   }
8051 
8052   address generate_cont_returnBarrier_exception() {
8053     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
8054     address start = __ pc();
8055 
8056     generate_cont_thaw(true, true);
8057 
8058     return start;
8059   }
8060 
8061   address generate_cont_interpreter_forced_preempt_return() {
8062       StubCodeMark mark(this, "StubRoutines", "cont interpreter forced preempt return");
8063       address start = __ pc();
8064 
8065       // This is necessary for forced yields, as the return address (in rbx) is captured in a call_VM, which skips the restoration of rbcp and locals
8066       // see InterpreterMacroAssembler::restore_bcp/restore_locals
8067       // TODO: use InterpreterMacroAssembler
8068       static const Register _locals_register = r14;
8069       static const Register _bcp_register    = r13;
8070 
8071       __ pop(rbp);
8072 
8073       __ movptr(_bcp_register,    Address(rbp, frame::interpreter_frame_bcp_offset    * wordSize));
8074       __ movptr(_locals_register, Address(rbp, frame::interpreter_frame_locals_offset * wordSize));
8075       // __ reinit_heapbase();
8076 
8077       __ ret(0);
8078 
8079       return start;
8080     }
8081 
8082 #if INCLUDE_JFR
8083 
8084   static void jfr_set_last_java_frame(MacroAssembler* _masm) {
8085     Register last_java_pc = c_rarg0;
8086     Register last_java_sp = c_rarg2;
8087     __ movptr(last_java_pc, Address(rsp, 0));
8088     __ lea(last_java_sp, Address(rsp, wordSize));
8089     __ vzeroupper();
8090     Address anchor_java_pc(r15_thread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
8091     __ movptr(anchor_java_pc, last_java_pc);
8092     __ movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
8093   }
8094 
8095   static void jfr_prologue(MacroAssembler* _masm) {
8096     jfr_set_last_java_frame(_masm);
8097     __ movptr(c_rarg0, r15_thread);
8098   }
8099 
8100   // The handle is dereferenced here using the correct load constructs.
8101   static void jfr_epilogue(MacroAssembler* _masm) {
8102     __ reset_last_Java_frame(false);
8103     Label null_jobject;
8104     __ testq(rax, rax);
8105     __ jcc(Assembler::zero, null_jobject);
8106     DecoratorSet decorators = ACCESS_READ | IN_NATIVE;
8107     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
8108     bs->load_at(_masm, decorators, T_OBJECT, rax, Address(rax, 0), c_rarg1, r15_thread);
8109     __ bind(null_jobject);
8110   }
8111 
8112   // For c2: c_rarg0 is junk, c_rarg1 is the thread id. Call to runtime to write a checkpoint.
8113   // Runtime will return a jobject handle to the event writer. The handle is dereferenced and the return value
8114   // is the event writer oop.
8115   address generate_jfr_write_checkpoint() {
8116     StubCodeMark mark(this, "jfr_write_checkpoint", "JFR C2 support for Virtual Threads");
8117 
8118     address start = __ pc();
8119     jfr_prologue(_masm);
8120     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JFR_WRITE_CHECKPOINT_FUNCTION), 2);
8121     jfr_epilogue(_masm);
8122     __ ret(0);
8123 
8124     return start;
8125   }
8126 
8127   // For c1: call the corresponding runtime routine, it returns a jobject handle to the event writer.
8128   // The handle is dereferenced and the return value is the event writer oop.
8129   address generate_jfr_get_event_writer() {
8130     StubCodeMark mark(this, "jfr_get_event_writer", "JFR C1 support for Virtual Threads");
8131     address start = __ pc();
8132 
8133     jfr_prologue(_masm);
8134     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JFR_GET_EVENT_WRITER_FUNCTION), 1);
8135     jfr_epilogue(_masm);
8136     __ ret(0);
8137 
8138     return start;
8139   }
8140 
8141 #endif // INCLUDE_JFR
8142 
8143 #undef __
8144 #define __ masm->
8145 
8146   // Continuation point for throwing of implicit exceptions that are
8147   // not handled in the current activation. Fabricates an exception
8148   // oop and initiates normal exception dispatching in this
8149   // frame. Since we need to preserve callee-saved values (currently
8150   // only for C2, but done for C1 as well) we need a callee-saved oop
8151   // map and therefore have to make these stubs into RuntimeStubs
8152   // rather than BufferBlobs.  If the compiler needs all registers to
8153   // be preserved between the fault point and the exception handler
8154   // then it must assume responsibility for that in
8155   // AbstractCompiler::continuation_for_implicit_null_exception or
8156   // continuation_for_implicit_division_by_zero_exception. All other
8157   // implicit exceptions (e.g., NullPointerException or
8158   // AbstractMethodError on entry) are either at call sites or
8159   // otherwise assume that stack unwinding will be initiated, so
8160   // caller saved registers were assumed volatile in the compiler.
8161   address generate_throw_exception(const char* name,
8162                                    address runtime_entry,

8351       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8352         StubRoutines::_dsin = generate_libmSin();
8353       }
8354       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8355         StubRoutines::_dcos = generate_libmCos();
8356       }
8357       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
8358         StubRoutines::_dtan = generate_libmTan();
8359       }
8360     }
8361 
8362     // Safefetch stubs.
8363     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
8364                                                        &StubRoutines::_safefetch32_fault_pc,
8365                                                        &StubRoutines::_safefetch32_continuation_pc);
8366     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
8367                                                        &StubRoutines::_safefetchN_fault_pc,
8368                                                        &StubRoutines::_safefetchN_continuation_pc);
8369   }
8370 
8371   void generate_phase1() {
8372     // Continuation stubs:
8373     StubRoutines::_cont_thaw          = generate_cont_thaw();
8374     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8375     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8376     StubRoutines::_cont_doYield_stub = generate_cont_doYield();
8377     StubRoutines::_cont_doYield    = StubRoutines::_cont_doYield_stub->entry_point();
8378     StubRoutines::_cont_jump_from_sp = generate_cont_jump_from_safepoint();
8379     StubRoutines::_cont_interpreter_forced_preempt_return = generate_cont_interpreter_forced_preempt_return();
8380 
8381     JFR_ONLY(StubRoutines::_jfr_write_checkpoint = generate_jfr_write_checkpoint();)
8382     JFR_ONLY(StubRoutines::_jfr_get_event_writer = generate_jfr_get_event_writer();)
8383   }
8384 
8385   void generate_all() {
8386     // Generates all stubs and initializes the entry points
8387 
8388     // These entry points require SharedInfo::stack0 to be set up in
8389     // non-core builds and need to be relocatable, so they each
8390     // fabricate a RuntimeStub internally.
8391     StubRoutines::_throw_AbstractMethodError_entry =
8392       generate_throw_exception("AbstractMethodError throw_exception",
8393                                CAST_FROM_FN_PTR(address,
8394                                                 SharedRuntime::
8395                                                 throw_AbstractMethodError));
8396 
8397     StubRoutines::_throw_IncompatibleClassChangeError_entry =
8398       generate_throw_exception("IncompatibleClassChangeError throw_exception",
8399                                CAST_FROM_FN_PTR(address,
8400                                                 SharedRuntime::
8401                                                 throw_IncompatibleClassChangeError));
8402 
8403     StubRoutines::_throw_NullPointerException_at_call_entry =
8404       generate_throw_exception("NullPointerException at call throw_exception",

8622         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libsvml, ebuf);
8623 
8624         snprintf(ebuf, sizeof(ebuf), "__svml_%s1_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8625         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsvml, ebuf);
8626 
8627         snprintf(ebuf, sizeof(ebuf), "__svml_%s2_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8628         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsvml, ebuf);
8629 
8630         snprintf(ebuf, sizeof(ebuf), "__svml_%s4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8631         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libsvml, ebuf);
8632       }
8633     }
8634 #endif // COMPILER2
8635 
8636     if (UseVectorizedMismatchIntrinsic) {
8637       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
8638     }
8639   }
8640 
8641  public:
8642   StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
8643     if (phase == 0) {


8644       generate_initial();
8645     } else if (phase == 1) {
8646       generate_phase1(); // stubs that must be available for the interpreter
8647     } else {
8648       generate_all();
8649     }
8650   }
8651 }; // end class declaration
8652 
8653 #define UCM_TABLE_MAX_ENTRIES 16
8654 void StubGenerator_generate(CodeBuffer* code, int phase) {
8655   if (UnsafeCopyMemory::_table == NULL) {
8656     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
8657   }
8658   StubGenerator g(code, phase);
8659 }
8660 
8661 #undef __
8662 #define __ masm->
8663 
8664 // on exit, rsp points to the ContinuationEntry
8665 // kills rax
8666 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
8667   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
8668   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
8669   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
8670 
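       // Reserve room on the stack for the ContinuationEntry metadata and account for it in
       // the caller's stack_slots.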
8671   stack_slots += (int)ContinuationEntry::size()/wordSize;
8672   __ subptr(rsp, (int32_t)ContinuationEntry::size()); // place Continuation metadata
8673 
8674   OopMap* map = new OopMap(((int)ContinuationEntry::size() + wordSize) / VMRegImpl::stack_slot_size, 0 /* arg_slots */);
8675   ContinuationEntry::setup_oopmap(map);
8676 
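       // Link this entry to the parent entry and make it the thread's current ContinuationEntry.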
8677   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
8678   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
8679   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
8680 
8681   return map;
8682 }
8683 
8684 // on entry, c_rarg1 points to the continuation
8685 //           rsp points to the ContinuationEntry
8686 // kills rax
8687 void fill_continuation_entry(MacroAssembler* masm) {
8688   DEBUG_ONLY(__ movl(Address(rsp, ContinuationEntry::cookie_offset()), 0x1234);)
8689 
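       // Store the continuation oop (passed in c_rarg1); the chunk and argsize fields start out empty.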
8690   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), c_rarg1);
8691   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), (int32_t)0);
8692   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), (int32_t)0);
8693 
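       // Save the parent's cont_fastpath and held monitor count in this entry so that
       // continuation_enter_cleanup can restore them when the continuation returns.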
8694   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
8695   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
8696   __ movl(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
8697   __ movl(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
8698
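       // Clear the thread's cont_fastpath and reset its held monitor count before running
       // the continuation.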
8699   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
8700   __ reset_held_monitor_count(r15_thread);
8701 }
8702 
8703 // on entry, rsp points to the ContinuationEntry
8704 // on exit, rsp points to the spilled rbp in the entry frame
8705 // kills rbx, rcx
8706 void continuation_enter_cleanup(MacroAssembler* masm) {
8707 #ifndef PRODUCT
8708   Label OK;
8709   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
8710   __ jcc(Assembler::equal, OK);
8711   __ stop("incorrect rsp1");
8712   __ bind(OK);
8713 #endif
8714
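       // Restore the parent's cont_fastpath and held monitor count that were saved in
       // fill_continuation_entry.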
8715   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
8716   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
8717   __ movl(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
8718   __ movl(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
8719 
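       // Unlink this entry: make the parent entry current again and pop the ContinuationEntry,
       // leaving rsp at the spilled rbp of the entry frame.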
8720   __ movptr(rcx, Address(rsp, ContinuationEntry::parent_offset()));
8721   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rcx);
8722   __ addptr(rsp, (int32_t)ContinuationEntry::size());
8723 }
8724 
8725 #undef __