
src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

*** 36,12 ***
--- 36,14 ---
  #include "nativeInst_x86.hpp"
  #include "oops/instanceOop.hpp"
  #include "oops/method.hpp"
  #include "oops/objArrayKlass.hpp"
  #include "oops/oop.inline.hpp"
+ #include "prims/jvmtiExport.hpp"
  #include "prims/methodHandles.hpp"
  #include "runtime/arguments.hpp"
+ #include "runtime/continuation.hpp"
  #include "runtime/frame.inline.hpp"
  #include "runtime/handles.inline.hpp"
  #include "runtime/sharedRuntime.hpp"
  #include "runtime/stubCodeGenerator.hpp"
  #include "runtime/stubRoutines.hpp"

*** 53,10 ***
--- 55,13 ---
  #include "jvmci/jvmci_globals.hpp"
  #endif
  #if INCLUDE_ZGC
  #include "gc/z/zThreadLocalData.hpp"
  #endif
+ #if INCLUDE_JFR
+ #include "jfr/support/jfrIntrinsics.hpp"
+ #endif
  
  // Declaration and definition of StubGenerator (no .hpp file).
  // For a more detailed description of the stub routine structure
  // see the comment in stubRoutines.hpp
  

*** 71,10 ***
--- 76,14 ---
  #endif
  
  #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
  
+ OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots);
+ void fill_continuation_entry(MacroAssembler* masm);
+ void continuation_enter_cleanup(MacroAssembler* masm);
+ 
  // Stub Code definitions
  
  class StubGenerator: public StubCodeGenerator {
   private:
  

*** 381,10 ***
--- 390,12 ---
        __ stop("StubRoutines::call_stub: threads must correspond");
        __ bind(L3);
      }
  #endif
  
+     __ pop_cont_fastpath(r15_thread);
+ 
      // restore regs belonging to calling function
  #ifdef _WIN64
      // emit the restores for xmm regs
      if (VM_Version::supports_evex()) {
        for (int i = xmm_save_first; i <= last_reg; i++) {

*** 1883,10 ***
--- 1894,443 ---
        __ jmp(L_copy_4_bytes);
      }
      return start;
    }
  
+   // Fast memory copying for continuations
+   // See:
+   // - Intel 64 and IA-32 Architectures Optimization Reference Manual: (https://software.intel.com/sites/default/files/managed/9e/bc/64-ia-32-architectures-optimization-manual.pdf)
+   //   - 2.7.6 REP String Enhancement
+   //   - 3.7.5 REP Prefix and Data Movement
+   //   - 3.7.6 Enhanced REP MOVSB and STOSB Operation
+   //   - 8.1 GENERAL PREFETCH CODING GUIDELINES
+   //   - 8.4.1.2 Streaming Non-temporal Stores, 8.4.1.3 Memory Type and Non-temporal Stores
+   //   - 8.5 MEMORY OPTIMIZATION USING PREFETCH, 8.5.6 Software Prefetch Scheduling Distance, 8.5.7 Software Prefetch Concatenation
+   //   - 14.3, MIXING AVX CODE WITH SSE CODE + https://software.intel.com/en-us/articles/intel-avx-state-transitions-migrating-sse-code-to-avx
+   // - Optimizing subroutines in assembly language, 17.9 Moving blocks of data https://www.agner.org/optimize/optimizing_assembly.pdf
+   // - StackOverflow
+   //   - https://stackoverflow.com/q/26246040/750563 What's missing/sub-optimal in this memcpy implementation?
+   //   - https://stackoverflow.com/q/43343231/750563 Enhanced REP MOVSB for memcpy
+   //   - https://stackoverflow.com/q/33902068/750563 What setup does REP do?
+   //   - https://stackoverflow.com/q/8858778/750563  Why are complicated memcpy/memset superior?
+   //   - https://stackoverflow.com/q/1715224/750563  Very fast memcpy for image processing?
+   //   - https://stackoverflow.com/q/17312823/750563 When program will benefit from prefetch & non-temporal load/store?
+   //   - https://stackoverflow.com/q/40096894/750563 Do current x86 architectures support non-temporal loads (from “normal” memory)?
+   //   - https://stackoverflow.com/q/32103968/750563 Non-temporal loads and the hardware prefetcher, do they work together?
+   // - https://docs.roguewave.com/threadspotter/2011.2/manual_html_linux/manual_html/ch05s03.html Non-Temporal Data
+   // - https://blogs.fau.de/hager/archives/2103 A case for the non-temporal store
+   // - https://vgatherps.github.io/2018-09-02-nontemporal/ Optimizing Cache Usage With Nontemporal Accesses
+   // - https://www.reddit.com/r/cpp/comments/9ccb88/optimizing_cache_usage_with_nontemporal_accesses/
+   // - https://lwn.net/Articles/255364/ Memory part 5: What programmers can do
+   // - https://software.intel.com/en-us/forums/intel-isa-extensions/topic/597075 Do Non-Temporal Loads Prefetch?
+   // - https://software.intel.com/en-us/forums/intel-fortran-compiler/topic/275765#comment-1551057 Time to revisit REP;MOVS
+   // - https://lemire.me/blog/2018/09/07/avx-512-when-and-how-to-use-these-new-instructions/ AVX-512: when and how to use these new instructions (explains AVX3Threshold)
+   // - https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html Gathering Intel on Intel AVX-512 Transitions
+ 
+ 
+   // Used by continuations to copy from stack
+   // Arguments:
+   //   name - stub name string
+   //   nt   -  use non-temporal stores
+   //
+   // Inputs:
+   //   c_rarg0   - source array address       -- 16-byte aligned
+   //   c_rarg1   - destination array address  --  8-byte aligned
+   //   c_rarg2   - element count, in qwords (8 bytes), >= 2
+   //
+   address generate_disjoint_word_copy_up(bool nt, const char *name) {
+     const bool align = nt;
+ 
+     __ align(CodeEntryAlignment);
+     StubCodeMark mark(this, "StubRoutines", name);
+     address start = __ pc();
+ 
+     Label L_copy_bytes, L_copy_8_bytes, L_loop, L_end, L_exit;
+     const Register from        = rdi;  // source array address
+     const Register to          = rsi;  // destination array address
+     const Register count       = rdx;  // elements count
+     const Register qword_count = count;
+     const Register end_from    = from; // source array end address
+     const Register end_to      = to;   // destination array end address
+     const Register alignment   = rcx;
+ 
+     // End pointers are inclusive, and if count is not zero they point
+     // to the last unit copied:  end_to[0] := end_from[0]
+ 
+     __ enter(); // required for proper stackwalking of RuntimeStub frame
+     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
+ 
+     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
+                       // r9 and r10 may be used to save non-volatile registers
+ 
+     // Copy from low to high addresses.
+     // By pointing to the end and negating qword_count we:
+     // 1. only update count, not from/to;
+     // 2. don't need another register to hold the total count;
+     // 3. can jcc right after addptr without cmpptr
+ 
+     // __ movptr(alignment, to);
+     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
+     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
+     __ negptr(qword_count); // make the count negative
+     // Address(end_from/to, qword_count, Address::times_8) now points 8 bytes *below* the original from/to
+     // i.e. orig to == Address(end_to, qword_count, Address::times_8, 8)
+ 
+     // Copy in multi-bytes chunks
+ 
+     if (UseUnalignedLoadStores) {
+       if (align) { // align target
+         NearLabel L_aligned_128, L_aligned_256, L_aligned_512;
+ 
+         __ lea(alignment, Address(end_to, qword_count, Address::times_8, 8)); // == original to
+         __ negptr(alignment); // we align by copying from the beginning of to, making it effectively larger
+ 
+         __ testl(alignment, 8);
+         __ jccb(Assembler::zero, L_aligned_128);
+         __ increment(qword_count);
+         // no need to test because we know qword_count >= 2
+         __ movq(rax, Address(end_from, qword_count, Address::times_8, -0));
+         __ movqa(Address(end_to, qword_count, Address::times_8, -0), rax, nt);
+         __ bind(L_aligned_128);
+ 
+         if (UseAVX >= 2) {
+           __ testl(alignment, 16);
+           __ jccb(Assembler::zero, L_aligned_256);
+           __ cmpptr(qword_count, -2);
+           if (UseAVX > 2) {
+             __ jcc(Assembler::greater, L_copy_8_bytes);
+           } else {
+             __ jccb(Assembler::greater, L_copy_8_bytes);
+           }
+           __ addptr(qword_count, 2);
+           __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -8));
+           __ movdqa(Address(end_to, qword_count, Address::times_8, -8), xmm0, nt);
+           __ bind(L_aligned_256);
+           // we can move from SSE to AVX without penalty, but not the other way around
+         }
+ 
+         if (UseAVX > 2) {
+           __ testl(alignment, 32);
+           __ jccb(Assembler::zero, L_aligned_512);
+           __ addptr(qword_count, 4);
+           __ jccb(Assembler::less, L_end);
+           __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+           __ vmovdqa(Address(end_to, qword_count, Address::times_8, -24), xmm0, nt);
+           __ bind(L_aligned_512);
+         }
+       }
+ 
+       // Copy 64-bytes per iteration
+       if (UseAVX > 2) {
+         Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
+ 
+         __ BIND(L_copy_bytes);
+         __ cmpptr(qword_count, (-1 * AVX3Threshold / 8));
+         __ jccb(Assembler::less, L_above_threshold);
+         __ jmpb(L_below_threshold);
+ 
+         __ align(OptoLoopAlignment);
+         __ bind(L_loop_avx512);
+         __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
+         __ evmovdqa(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit, nt);
+         __ bind(L_above_threshold);
+         __ addptr(qword_count, 8);
+         __ jcc(Assembler::lessEqual, L_loop_avx512);
+         __ jmpb(L_32_byte_head);
+ 
+         __ bind(L_loop_avx2);
+         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
+         __ vmovdqa(Address(end_to, qword_count, Address::times_8, -56), xmm0, nt);
+         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
+         __ vmovdqa(Address(end_to, qword_count, Address::times_8, -24), xmm1, nt);
+         __ bind(L_below_threshold);
+         __ addptr(qword_count, 8);
+         __ jcc(Assembler::lessEqual, L_loop_avx2);
+ 
+         __ bind(L_32_byte_head);
+         __ subptr(qword_count, 4);  // sub(8) and add(4)
+         __ jccb(Assembler::greater, L_end);
+       } else {
+         __ jmp(L_copy_bytes);
+         __ align(OptoLoopAlignment);
+         __ BIND(L_loop);
+         if (UseAVX == 2) {
+           __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
+           __ vmovdqa(Address(end_to, qword_count, Address::times_8, -56), xmm0, nt);
+           __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
+           __ vmovdqa(Address(end_to, qword_count, Address::times_8, -24), xmm1, nt);
+         } else {
+           __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
+           __ movdqa(Address(end_to, qword_count, Address::times_8, -56), xmm0, nt);
+           __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
+           __ movdqa(Address(end_to, qword_count, Address::times_8, -40), xmm1, nt);
+           __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
+           __ movdqa(Address(end_to, qword_count, Address::times_8, -24), xmm2, nt);
+           __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
+           __ movdqa(Address(end_to, qword_count, Address::times_8, - 8), xmm3, nt);
+         }
+ 
+         __ BIND(L_copy_bytes);
+         __ addptr(qword_count, 8);
+         __ jcc(Assembler::lessEqual, L_loop);
+         __ subptr(qword_count, 4);  // sub(8) and add(4); we added the extra 8 at the end of the loop; we'll subtract the extra 4 right before "copy trailing qwords"
+         __ jccb(Assembler::greater, L_end);
+       }
+       // Copy trailing 32 bytes
+       if (UseAVX >= 2) {
+         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+         __ vmovdqa(Address(end_to, qword_count, Address::times_8, -24), xmm0, nt);
+       } else {
+         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+         __ movdqa(Address(end_to, qword_count, Address::times_8, -24), xmm0, nt);
+         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
+         __ movdqa(Address(end_to, qword_count, Address::times_8, - 8), xmm1, nt);
+       }
+       __ addptr(qword_count, 4);
+     } else {
+       // Copy 32-bytes per iteration
+       __ jmp(L_copy_bytes);
+       __ align(OptoLoopAlignment);
+       __ BIND(L_loop);
+       __ movq(rax, Address(end_from, qword_count, Address::times_8, -24));
+       __ movqa(Address(end_to, qword_count, Address::times_8, -24), rax, nt);
+       __ movq(rax, Address(end_from, qword_count, Address::times_8, -16));
+       __ movqa(Address(end_to, qword_count, Address::times_8, -16), rax, nt);
+       __ movq(rax, Address(end_from, qword_count, Address::times_8, - 8));
+       __ movqa(Address(end_to, qword_count, Address::times_8, - 8), rax, nt);
+       __ movq(rax, Address(end_from, qword_count, Address::times_8, - 0));
+       __ movqa(Address(end_to, qword_count, Address::times_8, - 0), rax, nt);
+ 
+       __ BIND(L_copy_bytes);
+       __ addptr(qword_count, 4);
+       __ jcc(Assembler::lessEqual, L_loop);
+     }
+     __ BIND(L_end);
+     __ subptr(qword_count, 4);
+     __ jccb(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
+ 
+     __ BIND(L_exit);
+     restore_arg_regs();
+     __ xorptr(rax, rax); // return 0
+     __ vzeroupper();
+     __ leave(); // required for proper stackwalking of RuntimeStub frame
+     __ ret(0);
+ 
+     // Copy trailing qwords
+     __ BIND(L_copy_8_bytes);
+     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
+     __ movqa(Address(end_to, qword_count, Address::times_8, 8), rax, nt);
+     __ increment(qword_count);
+     __ jcc(Assembler::notZero, L_copy_8_bytes);
+     __ jmp(L_exit);
+ 
+     return start;
+   }
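For reference, a minimal sketch of the semantics this stub implements (not part of the change; it ignores the alignment, AVX dispatch and non-temporal-store machinery and just restates the contract from the header comment):

    #include <cstddef>
    #include <cstdint>

    // word_memcpy_up reference semantics: copy 'count' 8-byte words from
    // 'from' (16-byte aligned) to 'to' (8-byte aligned), walking from low to
    // high addresses; the ranges are disjoint and count >= 2.
    static void word_copy_up_reference(const uint64_t* from, uint64_t* to, size_t count) {
      for (size_t i = 0; i < count; i++) {
        to[i] = from[i];
      }
    }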
+ 
+   // Used by continuations to copy to stack
+   // Arguments:
+   //   name    - stub name string
+   //   nt_mode - 0 - none, 1 - use non-temporal prefetches, 2 - use non-temporal loads
+   //
+   // Inputs:
+   //   c_rarg0   - source array address      --  8-byte aligned
+   //   c_rarg1   - destination array address -- 16-byte aligned
+   //   c_rarg2   - element count, in qwords (8 bytes), >= 2
+   //
+   address generate_disjoint_word_copy_down(int nt_mode, const char *name) {
+     const bool prefetchnt = (nt_mode == 1);
+     const bool nt         = (nt_mode == 2);
+     const bool align      = nt;
+ 
+     __ align(CodeEntryAlignment);
+     StubCodeMark mark(this, "StubRoutines", name);
+     address start = __ pc();
+ 
+     Label L_copy_bytes, L_copy_8_bytes, L_loop, L_end, L_exit;
+     const Register from        = rdi;  // source array address
+     const Register to          = rsi;  // destination array address
+     const Register count       = rdx;  // elements count
+     const Register qword_count = count;
+     const Register alignment   = rcx; // rbx causes trouble
+ 
+     __ enter(); // required for proper stackwalking of RuntimeStub frame
+     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
+ 
+     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
+                       // r9 and r10 may be used to save non-volatile registers
+ 
+     // Copy from high to low addresses.
+ 
+     // Copy in multi-bytes chunks
+ 
+     if (UseUnalignedLoadStores) {
+       if (align) { // align source (only useful for nt)
+         NearLabel L_aligned_128, L_aligned_256, L_aligned_512;
+ 
+         __ lea(alignment, Address(from, qword_count, Address::times_8, 0)); // == end of the source array
+ 
+         __ testl(alignment, 8);
+         __ jccb(Assembler::zero, L_aligned_128);
+         __ decrement(qword_count);
+         // no need to test because we know qword_count >= 2
+         __ movdqa(xmm0, Address(from, qword_count, Address::times_8, 0), nt); // no 8-byte nt load
+         __ psrldq(xmm0, 8); // movlhps(xmm0, xmm0);
+         __ movdq(rax, xmm0);
+         // __ movq(rax, Address(from, qword_count, Address::times_8, 0));
+         __ movq(Address(to, qword_count, Address::times_8, 0), rax);
+         __ bind(L_aligned_128);
+ 
+         if (UseAVX >= 2) {
+           __ testl(alignment, 16);
+           __ jccb(Assembler::zero, L_aligned_256);
+           __ cmpptr(qword_count, 2);
+           if (UseAVX > 2) {
+             __ jcc(Assembler::less, L_copy_8_bytes);
+           } else {
+             __ jccb(Assembler::less, L_copy_8_bytes);
+           }
+           __ subptr(qword_count, 2);
+           __ movdqa(xmm0, Address(from, qword_count, Address::times_8, 0), nt);
+           __ movdqu(Address(to, qword_count, Address::times_8, 0), xmm0);
+           __ bind(L_aligned_256);
+           // we can move from SSE to AVX without penalty, but not the other way around
+         }
+ 
+         if (UseAVX > 2) {
+           __ testl(alignment, 32);
+           __ jccb(Assembler::zero, L_aligned_512);
+           __ subptr(qword_count, 4);
+           __ jccb(Assembler::less, L_end);
+           __ vmovdqa(xmm0, Address(from, qword_count, Address::times_8, 0), nt);
+           __ vmovdqu(Address(to, qword_count, Address::times_8, 0), xmm0);
+           __ bind(L_aligned_512);
+         }
+       }
+ 
+       // Copy 64-bytes per iteration
+       const int prefetch_distance = 2 * 64; // prefetch distance of 2
+       if (UseAVX > 2) {
+         Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
+ 
+         __ BIND(L_copy_bytes);
+         __ cmpptr(qword_count, (AVX3Threshold / 8));
+         __ jccb(Assembler::greater, L_above_threshold);
+         __ jmpb(L_below_threshold);
+ 
+         __ align(OptoLoopAlignment);
+         __ BIND(L_loop_avx512);
+         if (prefetchnt) {
+           __ prefetchnta(Address(from, qword_count, Address::times_8, -prefetch_distance));
+         }
+         __ evmovdqa(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit, nt);
+         __ evmovdqul(Address(to, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
+         __ bind(L_above_threshold);
+         __ subptr(qword_count, 8);
+         __ jcc(Assembler::greaterEqual, L_loop_avx512);
+         __ jmpb(L_32_byte_head);
+ 
+         __ bind(L_loop_avx2);
+         if (prefetchnt) {
+           __ prefetchnta(Address(from, qword_count, Address::times_8, -prefetch_distance));
+         }
+         __ vmovdqa(xmm0, Address(from, qword_count, Address::times_8, 32), nt);
+         __ vmovdqu(Address(to, qword_count, Address::times_8, 32), xmm0);
+         __ vmovdqa(xmm1, Address(from, qword_count, Address::times_8, 0), nt);
+         __ vmovdqu(Address(to, qword_count, Address::times_8, 0), xmm1);
+         __ bind(L_below_threshold);
+         __ subptr(qword_count, 8);
+         __ jcc(Assembler::greaterEqual, L_loop_avx2);
+ 
+         __ bind(L_32_byte_head);
+         __ addptr(qword_count, 4);  // add(8) and sub(4)
+         __ jccb(Assembler::less, L_end);
+       } else {
+         __ jmp(L_copy_bytes);
+         __ align(OptoLoopAlignment);
+         __ BIND(L_loop);
+         if (prefetchnt) {
+           __ prefetchnta(Address(from, qword_count, Address::times_8, -prefetch_distance));
+         }
+         if (UseAVX == 2) {
+           __ vmovdqa(xmm0, Address(from, qword_count, Address::times_8, 32), nt);
+           __ vmovdqu(Address(to, qword_count, Address::times_8, 32), xmm0);
+           __ vmovdqa(xmm1, Address(from, qword_count, Address::times_8,  0), nt);
+           __ vmovdqu(Address(to, qword_count, Address::times_8,  0), xmm1);
+         } else {
+           __ movdqa(xmm0, Address(from, qword_count, Address::times_8, 48), nt);
+           __ movdqu(Address(to, qword_count, Address::times_8, 48), xmm0);
+           __ movdqa(xmm1, Address(from, qword_count, Address::times_8, 32), nt);
+           __ movdqu(Address(to, qword_count, Address::times_8, 32), xmm1);
+           __ movdqa(xmm2, Address(from, qword_count, Address::times_8, 16), nt);
+           __ movdqu(Address(to, qword_count, Address::times_8, 16), xmm2);
+           __ movdqa(xmm3, Address(from, qword_count, Address::times_8,  0), nt);
+           __ movdqu(Address(to, qword_count, Address::times_8,  0), xmm3);
+         }
+ 
+         __ BIND(L_copy_bytes);
+         __ subptr(qword_count, 8);
+         __ jcc(Assembler::greaterEqual, L_loop);
+ 
+         __ addptr(qword_count, 4);  // add(8) and sub(4)
+         __ jccb(Assembler::less, L_end);
+       }
+       // Copy trailing 32 bytes
+       if (UseAVX >= 2) {
+         __ vmovdqa(xmm0, Address(from, qword_count, Address::times_8, 0), nt);
+         __ vmovdqu(Address(to, qword_count, Address::times_8, 0), xmm0);
+       } else {
+         __ movdqa(xmm0, Address(from, qword_count, Address::times_8, 16), nt);
+         __ movdqu(Address(to, qword_count, Address::times_8, 16), xmm0);
+         __ movdqa(xmm1, Address(from, qword_count, Address::times_8,  0), nt);
+         __ movdqu(Address(to, qword_count, Address::times_8,  0), xmm1);
+       }
+       __ subptr(qword_count, 4);
+     } else {
+       // Copy 32-bytes per iteration
+       const int prefetch_distance = 4 * 32; // prefetch distance of 4
+       __ jmp(L_copy_bytes);
+       __ align(OptoLoopAlignment);
+       __ BIND(L_loop);
+       if (prefetchnt) {
+         __ prefetchnta(Address(from, qword_count, Address::times_8, -prefetch_distance));
+       }
+       __ movq(rax, Address(from, qword_count, Address::times_8, 24));
+       __ movq(Address(to, qword_count, Address::times_8, 24), rax);
+       __ movq(rax, Address(from, qword_count, Address::times_8, 16));
+       __ movq(Address(to, qword_count, Address::times_8, 16), rax);
+       __ movq(rax, Address(from, qword_count, Address::times_8,  8));
+       __ movq(Address(to, qword_count, Address::times_8,  8), rax);
+       __ movq(rax, Address(from, qword_count, Address::times_8,  0));
+       __ movq(Address(to, qword_count, Address::times_8,  0), rax);
+ 
+       __ BIND(L_copy_bytes);
+       __ subptr(qword_count, 4);
+       __ jcc(Assembler::greaterEqual, L_loop);
+     }
+     __ BIND(L_end);
+     __ addptr(qword_count, 4);
+     __ jccb(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
+ 
+     __ BIND(L_exit);
+     restore_arg_regs();
+     __ xorptr(rax, rax); // return 0
+     __ vzeroupper();
+     __ leave(); // required for proper stackwalking of RuntimeStub frame
+     __ ret(0);
+ 
+     // Copy trailing qwords
+     __ BIND(L_copy_8_bytes);
+     if (nt) {
+       __ prefetchnta(Address(from, qword_count, Address::times_8, -8));
+     }
+     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
+     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
+     __ decrement(qword_count);
+     __ jcc(Assembler::notZero, L_copy_8_bytes);
+     __ jmp(L_exit);
+ 
+     return start;
+   }
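Likewise, a sketch of the downward variant (not part of the change); nt_mode only selects the instruction mix (plain, non-temporal prefetch, or non-temporal loads) and does not change the result:

    #include <cstddef>
    #include <cstdint>

    // word_memcpy_down reference semantics: copy 'count' 8-byte words from
    // 'from' (8-byte aligned) to 'to' (16-byte aligned), walking from high to
    // low addresses; the ranges are disjoint and count >= 2.
    static void word_copy_down_reference(const uint64_t* from, uint64_t* to, size_t count) {
      for (size_t i = count; i > 0; i--) {
        to[i - 1] = from[i - 1];
      }
    }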
+ 
    // Arguments:
    //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
    //             ignored
    //   name    - stub name string
    //

*** 3445,10 ***
--- 3889,16 ---
      StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
      StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
  
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
      StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
+ 
+     StubRoutines::_has_word_memcpy     = true;
+     StubRoutines::_word_memcpy_up      = generate_disjoint_word_copy_up  (false, "word_memcpy_up");
+     StubRoutines::_word_memcpy_up_nt   = generate_disjoint_word_copy_up  (true,  "word_memcpy_up_nt");
+     StubRoutines::_word_memcpy_down    = generate_disjoint_word_copy_down(0,     "word_memcpy_down");
+     StubRoutines::_word_memcpy_down_nt = generate_disjoint_word_copy_down(1,     "word_memcpy_down_nt");
    }
  
    // AES intrinsic stubs
    enum {AESBlockSize = 16};
  

*** 7415,10 ***
--- 7865,283 ---
  
      return start;
  
    }
  
+   RuntimeStub* generate_cont_doYield() {
+     const char *name = "cont_doYield";
+ 
+     enum layout {
+       rbp_off,
+       rbpH_off,
+       return_off,
+       return_off2,
+       framesize // inclusive of return address
+     };
+     // assert(is_even(framesize/2), "sp not 16-byte aligned");
+     
+     int insts_size = 512;
+     int locs_size  = 64;
+     CodeBuffer code(name, insts_size, locs_size);
+     OopMapSet* oop_maps  = new OopMapSet();
+     MacroAssembler* masm = new MacroAssembler(&code);
+     MacroAssembler* _masm = masm;
+ 
+     address start = __ pc();
+ 
+     __ enter();
+ 
+     __ movptr(c_rarg1, rsp);
+ 
+     int frame_complete = __ pc() - start;
+     address the_pc = __ pc();
+ 
+     __ post_call_nop(); // this must come exactly after the pc value that is pushed into the frame info; we use this nop for fast CodeBlob lookup
+ 
+     if (ContPerfTest > 5) {
+       __ movptr(c_rarg0, r15_thread);
+       __ set_last_Java_frame(rsp, rbp, the_pc);
+ 
+       __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::freeze), 2);
+       
+       __ reset_last_Java_frame(true);
+     }
+ 
+     Label pinned;
+ 
+     if (ContPerfTest <= 5) { __ xorq(rax, rax); }
+     __ testq(rax, rax);
+     __ jcc(Assembler::notZero, pinned);
+ 
+     __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
+     continuation_enter_cleanup(masm);
+     __ pop(rbp);
+     __ ret(0);
+ 
+     __ bind(pinned); // pinned -- return to caller
+ 
+     __ leave();
+     __ ret(0);
+ 
+     OopMap* map = new OopMap(framesize, 1);
+     // map->set_callee_saved(VMRegImpl::stack2reg(rbp_off), rbp->as_VMReg());
+     oop_maps->add_gc_map(the_pc - start, map);
+ 
+     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
+     RuntimeStub::new_runtime_stub(name,
+                                   &code,
+                                   frame_complete,
+                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
+                                   oop_maps, false);
+     return stub;
+   }
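A note on the frame-size arithmetic above (illustrative only, not part of the change): the layout enum counts 32-bit VMReg slots, while RuntimeStub::new_runtime_stub() takes the frame size in words, hence the shift by LogBytesPerWord - LogBytesPerInt:

    // framesize == 4 slots (rbp_off, rbpH_off, return_off, return_off2).
    // On x86_64, LogBytesPerWord == 3 and LogBytesPerInt == 2, so the shift is 1:
    static_assert((4 >> (3 - 2)) == 2, "4 slots == 2 words: saved rbp + return address");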
+ 
+   address generate_cont_jump_from_safepoint() {
+     StubCodeMark mark(this, "StubRoutines","Continuation jump from safepoint");
+ 
+     address start = __ pc();
+ 
+     __ get_thread(r15_thread);
+     __ reset_last_Java_frame(true); // false would be fine, too, I guess
+     __ reinit_heapbase();
+     
+     __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
+     continuation_enter_cleanup(_masm);
+     __ pop(rbp);
+     __ ret(0);
+ 
+     return start;
+   }
+ 
+   address generate_cont_thaw(bool return_barrier, bool exception) {
+     assert (return_barrier || !exception, "must be");
+ 
+     address start = __ pc();
+ 
+     // TODO: Handle Valhalla return types. May require generating different return barriers.
+ 
+     if (!return_barrier) {
+       __ pop(c_rarg3); // pop return address; if we don't do this we get a drift, where the bottom-most frozen frame continuously grows
+     } else {
+       __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
+     }
+     assert_asm(_masm, cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())), Assembler::equal, "incorrect rsp");
+ 
+     if (return_barrier) {
+       __ push(rax); __ push_d(xmm0); // preserve possible return value from a method returning to the return barrier
+     }
+ 
+     __ movl(c_rarg1, (return_barrier ? 1 : 0) + (exception ? 1 : 0));
+     if (ContPerfTest > 105) {
+       __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), r15_thread, c_rarg1);
+       __ movptr(rbx, rax); // rax contains the size of the frames to thaw, 0 if overflow or no more frames
+     } else {
+       __ xorq(rbx, rbx);
+     }
+     if (return_barrier) {
+       __ pop_d(xmm0); __ pop(rax); // restore return value (no safepoint in the call to prepare_thaw, so even an oop return value should be OK)
+     }
+     assert_asm(_masm, cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())), Assembler::equal, "incorrect rsp");
+   // #ifdef ASSERT
+   //   __ lea(rcx, Address(rsp, wordSize));
+   //   assert_asm(_masm, cmpptr(rcx, Address(r15_thread, JavaThread::cont_entry_offset())), Assembler::equal, "incorrect rsp");
+   // #endif
+ 
+     Label thaw_success;
+     __ testq(rbx, rbx);           // rbx contains the size of the frames to thaw, 0 if overflow or no more frames
+     __ jcc(Assembler::notZero, thaw_success);
+     __ jump(ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
+     __ bind(thaw_success);
+ 
+     __ subq(rsp, rbx);             // make room for the thawed frames
+     __ andptr(rsp, -16);           // align
+     
+     if (return_barrier) {
+       __ push(rax); __ push_d(xmm0); // save original return value -- again
+     }
+ 
+     __ movl(c_rarg1, (return_barrier ? 1 : 0) + (exception ? 1 : 0));
+     if (ContPerfTest > 112) {
+       __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::thaw), r15_thread, c_rarg1);
+     }
+     __ movptr(rbx, rax); // rax is the sp of the yielding frame
+ 
+     if (return_barrier) {
+       __ pop_d(xmm0); __ pop(rax); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
+     } else {
+       __ movl(rax, 0); // return 0 (success) from doYield
+     }
+ 
+     __ movptr(rsp, rbx); // we're now on the yield frame (which is at an address above us because rsp has been pushed down)
+     __ subptr(rsp, 2*wordSize); // now pointing to rbp spill
+ 
+     if (exception) {
+       __ movptr(c_rarg1, Address(rsp, wordSize)); // return address
+       __ push(rax); // save return value containing the exception oop
+       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), r15_thread, c_rarg1);
+       __ movptr(rbx, rax); // the exception handler
+       __ pop(rax); // restore return value containing the exception oop
+       __ pop(rbp);
+       __ pop(rdx); // rdx must contain the original pc in the case of exception; see OptoRuntime::generate_exception_blob
+       __ jmp(rbx); // the exception handler
+     }
+ 
+     // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
+     __ pop(rbp);
+     __ ret(0);
+ 
+     return start;
+   }
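For readability (a sketch, not part of the change), the value passed in c_rarg1 to Continuation::prepare_thaw and Continuation::thaw above encodes which of the three entry points below we came from; the names here are hypothetical:

    // Hypothetical names for the (return_barrier ? 1 : 0) + (exception ? 1 : 0) encoding.
    enum thaw_kind {
      thaw_from_yield         = 0,  // generate_cont_thaw()
      thaw_return_barrier     = 1,  // generate_cont_returnBarrier()
      thaw_return_barrier_exc = 2   // generate_cont_returnBarrier_exception()
    };

    static int thaw_kind_for(bool return_barrier, bool exception) {
      return (return_barrier ? 1 : 0) + (exception ? 1 : 0);
    }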
+ 
+   address generate_cont_thaw() {
+     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
+     address start = __ pc();
+     generate_cont_thaw(false, false);
+     return start;
+   }
+ 
+   address generate_cont_returnBarrier() {
+     // TODO: will probably need multiple return barriers depending on return type
+     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
+     address start = __ pc();
+ 
+     generate_cont_thaw(true, false);
+ 
+     return start;
+   }
+ 
+   address generate_cont_returnBarrier_exception() {
+     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
+     address start = __ pc();
+ 
+     generate_cont_thaw(true, true);
+ 
+     return start;
+   }
+ 
+   address generate_cont_interpreter_forced_preempt_return() {
+     StubCodeMark mark(this, "StubRoutines", "cont interpreter forced preempt return");
+     address start = __ pc();
+ 
+     // This is necessary for forced yields, as the return address (in rbx) is captured in a call_VM
+     // and skips the restoration of rbcp and locals;
+     // see InterpreterMacroAssembler::restore_bcp/restore_locals
+     // TODO: use InterpreterMacroAssembler
+     static const Register _locals_register = r14;
+     static const Register _bcp_register    = r13;
+ 
+     __ pop(rbp);
+ 
+     __ movptr(_bcp_register,    Address(rbp, frame::interpreter_frame_bcp_offset    * wordSize));
+     __ movptr(_locals_register, Address(rbp, frame::interpreter_frame_locals_offset * wordSize));
+     // __ reinit_heapbase();
+ 
+     __ ret(0);
+ 
+     return start;
+   }
+ 
+ #if INCLUDE_JFR
+ 
+   static void jfr_set_last_java_frame(MacroAssembler* _masm) {
+     Register last_java_pc = c_rarg0;
+     Register last_java_sp = c_rarg2;
+     __ movptr(last_java_pc, Address(rsp, 0));
+     __ lea(last_java_sp, Address(rsp, wordSize));
+     __ vzeroupper();
+     Address anchor_java_pc(r15_thread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
+     __ movptr(anchor_java_pc, last_java_pc);
+     __ movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
+   }
+ 
+   static void jfr_prologue(MacroAssembler* _masm) {
+     jfr_set_last_java_frame(_masm);
+     __ movptr(c_rarg0, r15_thread);
+   }
+ 
+   // The handle is dereferenced here using the correct load constructs.
+   static void jfr_epilogue(MacroAssembler* _masm) {
+     __ reset_last_Java_frame(false);
+     Label null_jobject;
+     __ testq(rax, rax);
+     __ jcc(Assembler::zero, null_jobject);
+     DecoratorSet decorators = ACCESS_READ | IN_NATIVE;
+     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
+     bs->load_at(_masm, decorators, T_OBJECT, rax, Address(rax, 0), c_rarg1, r15_thread);
+     __ bind(null_jobject);
+   }
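In C++ terms (a sketch, not part of the change), the null check plus barriered load in jfr_epilogue is the assembly counterpart of resolving a JNI handle on the VM side:

    #include "runtime/jniHandles.hpp"

    // The runtime call returns a jobject handle to the event writer; resolving it
    // (null check + load through the GC access barrier) yields the oop that the
    // stub leaves in rax.
    static oop resolve_event_writer(jobject handle) {
      return JNIHandles::resolve(handle);  // returns NULL for a null handle
    }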
+ 
+   // For c2: c_rarg0 is junk, c_rarg1 is the thread id. Calls into the runtime to write a checkpoint.
+   // The runtime returns a jobject handle to the event writer. The handle is dereferenced and the return value
+   // is the event writer oop.
+   address generate_jfr_write_checkpoint() {
+     StubCodeMark mark(this, "jfr_write_checkpoint", "JFR C2 support for Virtual Threads");
+ 
+     address start = __ pc();
+     jfr_prologue(_masm);
+     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JFR_WRITE_CHECKPOINT_FUNCTION), 2);
+     jfr_epilogue(_masm);
+     __ ret(0);
+ 
+     return start;
+   }
+ 
+   // For c1: call the corresponding runtime routine, it returns a jobject handle to the event writer.
+   // The handle is dereferenced and the return value is the event writer oop.
+   address generate_jfr_get_event_writer() {
+     StubCodeMark mark(this, "jfr_get_event_writer", "JFR C1 support for Virtual Threads");
+     address start = __ pc();
+ 
+     jfr_prologue(_masm);
+     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JFR_GET_EVENT_WRITER_FUNCTION), 1);
+     jfr_epilogue(_masm);
+     __ ret(0);
+ 
+     return start;
+   }
+ 
+ #endif // INCLUDE_JFR
+ 
  #undef __
  #define __ masm->
  
    // Continuation point for throwing of implicit exceptions that are
    // not handled in the current activation. Fabricates an exception

*** 7643,10 ***
--- 8366,24 ---
      generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                         &StubRoutines::_safefetchN_fault_pc,
                                                         &StubRoutines::_safefetchN_continuation_pc);
    }
  
+   void generate_phase1() {
+     // Continuation stubs:
+     StubRoutines::_cont_thaw          = generate_cont_thaw();
+     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
+     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
+     StubRoutines::_cont_doYield_stub = generate_cont_doYield();
+     StubRoutines::_cont_doYield    = StubRoutines::_cont_doYield_stub->entry_point();
+     StubRoutines::_cont_jump_from_sp = generate_cont_jump_from_safepoint();
+     StubRoutines::_cont_interpreter_forced_preempt_return = generate_cont_interpreter_forced_preempt_return();
+ 
+     JFR_ONLY(StubRoutines::_jfr_write_checkpoint = generate_jfr_write_checkpoint();)
+     JFR_ONLY(StubRoutines::_jfr_get_event_writer = generate_jfr_get_event_writer();)
+   }
+ 
    void generate_all() {
      // Generates all stubs and initializes the entry points
  
      // These entry points require SharedInfo::stack0 to be set up in
      // non-core builds and need to be relocatable, so they each

*** 7900,21 ***
        StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
      }
    }
  
   public:
!   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
!     if (all) {
-       generate_all();
-     } else {
        generate_initial();
      }
    }
  }; // end class declaration
  
  #define UCM_TABLE_MAX_ENTRIES 16
! void StubGenerator_generate(CodeBuffer* code, bool all) {
    if (UnsafeCopyMemory::_table == NULL) {
      UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
    }
!   StubGenerator g(code, all);
  }
--- 8637,89 ---
        StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
      }
    }
  
   public:
!   StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
!     if (phase == 0) {
        generate_initial();
+     } else if (phase == 1) {
+       generate_phase1(); // stubs that must be available for the interpreter
+     } else {
+       generate_all();
      }
    }
  }; // end class declaration
  
  #define UCM_TABLE_MAX_ENTRIES 16
! void StubGenerator_generate(CodeBuffer* code, int phase) {
    if (UnsafeCopyMemory::_table == NULL) {
      UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
    }
!   StubGenerator g(code, phase);
+ }
+ 
+ #undef __
+ #define __ masm->
+ 
+ // on exit, rsp points to the ContinuationEntry
+ // kills rax
+ OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
+   assert (ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
+   assert (in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
+   assert (in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
+ 
+   stack_slots += (int)ContinuationEntry::size()/wordSize;
+   __ subptr(rsp, (int32_t)ContinuationEntry::size()); // place Continuation metadata
+ 
+   OopMap* map = new OopMap(((int)ContinuationEntry::size() + wordSize)/ VMRegImpl::stack_slot_size, 0 /* arg_slots*/);
+   ContinuationEntry::setup_oopmap(map);
+ 
+   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
+   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
+   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
+ 
+   return map;
+ }
+ 
+ // on entry c_rarg1 points to the continuation 
+ //          rsp points to ContinuationEntry
+ // kills rax
+ void fill_continuation_entry(MacroAssembler* masm) {
+   DEBUG_ONLY(__ movl(Address(rsp, ContinuationEntry::cookie_offset()), 0x1234);)
+ 
+   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), c_rarg1);
+   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), (int32_t)0);
+   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), (int32_t)0);
+ 
+   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
+   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
+   __ movl(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
+   __ movl(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
+   
+   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
+   __ reset_held_monitor_count(r15_thread);
  }
+ 
+ // on entry, rsp points to the ContinuationEntry
+ // on exit, rsp points to the spilled rbp in the entry frame
+ // kills rbx, rcx
+ void continuation_enter_cleanup(MacroAssembler* masm) {
+ #ifndef PRODUCT
+   Label OK;
+   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
+   __ jcc(Assembler::equal, OK);
+   __ stop("incorrect rsp1");
+   __ bind(OK);
+ #endif
+   
+   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
+   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
+   __ movl(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
+   __ movl(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
+ 
+   __ movptr(rcx, Address(rsp, ContinuationEntry::parent_offset()));
+   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rcx);
+   __ addptr(rsp, (int32_t)ContinuationEntry::size());
+ }
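As a reading aid (a sketch, not part of the change), the ContinuationEntry state that continuation_enter_setup, fill_continuation_entry and continuation_enter_cleanup manipulate can be pictured as the struct below; field names mirror the *_offset() accessors, while the exact types and any further fields belong to the real class in the continuation runtime:

    // Sketch of the metadata block carved out below rsp by continuation_enter_setup
    // and released again by continuation_enter_cleanup.
    struct ContinuationEntrySketch {
      void* parent;                      // previous ContinuationEntry of this thread
      void* cont;                        // the continuation oop (from c_rarg1)
      void* chunk;                       // current stack chunk, initially NULL
      int   argsize;                     // initially 0
      void* parent_cont_fastpath;        // saved JavaThread::cont_fastpath
      int   parent_held_monitor_count;   // saved JavaThread::held_monitor_count
      // debug builds also carry an int cookie, set to 0x1234 by fill_continuation_entry
    };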
+ 
+ #undef __