1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/vframeArray.hpp"
  56 #include "runtime/vm_version.hpp"
  57 #include "utilities/align.hpp"
  58 #include "utilities/checkedCast.hpp"
  59 #include "utilities/formatBuffer.hpp"
  60 #include "vmreg_x86.inline.hpp"
  61 #ifdef COMPILER1
  62 #include "c1/c1_Runtime1.hpp"
  63 #endif
  64 #ifdef COMPILER2
  65 #include "opto/runtime.hpp"
  66 #endif
  67 #if INCLUDE_JVMCI
  68 #include "jvmci/jvmciJavaClasses.hpp"
  69 #endif
  70 
  71 #define __ masm->
  72 
  73 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  74 
  75 class SimpleRuntimeFrame {
  76 
  77   public:
  78 
  79   // Most of the runtime stubs have this simple frame layout.
  // This class exists so the layout can be shared in one place.
  81   // Offsets are for compiler stack slots, which are jints.
  82   enum layout {
  83     // The frame sender code expects that rbp will be in the "natural" place and
  84     // will override any oopMap setting for it. We must therefore force the layout
  85     // so that it agrees with the frame sender code.
  86     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  87     rbp_off2,
  88     return_off, return_off2,
  89     framesize
  90   };
  91 };
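// Illustration (not used by the code): with no argument shadow area
// (i.e. frame::arg_reg_save_area_bytes == 0) the enum above yields
//   rbp_off == 0, rbp_off2 == 1, return_off == 2, return_off2 == 3, framesize == 4,
// i.e. a two-word frame holding just the saved rbp and the return address.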
  92 
  93 class RegisterSaver {
  94   // Capture info about frame layout.  Layout offsets are in jint
  95   // units because compiler frame slots are jints.
  96 #define XSAVE_AREA_BEGIN 160
  97 #define XSAVE_AREA_YMM_BEGIN 576
  98 #define XSAVE_AREA_OPMASK_BEGIN 1088
  99 #define XSAVE_AREA_ZMM_BEGIN 1152
 100 #define XSAVE_AREA_UPPERBANK 1664
 101 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 102 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 103 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 104 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 105 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
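// As an illustration, DEF_XMM_OFFS(0) expands to
//   xmm0_off = xmm_off + (0)*16/BytesPerInt, xmm0H_off
// i.e. every register contributes two consecutive jint-sized slots named
// <reg>_off and <reg>H_off. The XSAVE_AREA_* constants above are byte offsets
// into the FPU/vector save area at the bottom of the frame (see the layout below).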
 106   enum layout {
 107     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 108     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 109     DEF_XMM_OFFS(0),
 110     DEF_XMM_OFFS(1),
 111     // 2..15 are implied in range usage
 112     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 113     DEF_YMM_OFFS(0),
 114     DEF_YMM_OFFS(1),
 115     // 2..15 are implied in range usage
 116     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 117     DEF_OPMASK_OFFS(0),
 118     DEF_OPMASK_OFFS(1),
 119     // 2..7 are implied in range usage
 120     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 121     DEF_ZMM_OFFS(0),
 122     DEF_ZMM_OFFS(1),
 123     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 124     DEF_ZMM_UPPER_OFFS(16),
 125     DEF_ZMM_UPPER_OFFS(17),
 126     // 18..31 are implied in range usage
 127     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 128     fpu_stateH_end,
 129     r15_off, r15H_off,
 130     r14_off, r14H_off,
 131     r13_off, r13H_off,
 132     r12_off, r12H_off,
 133     r11_off, r11H_off,
 134     r10_off, r10H_off,
 135     r9_off,  r9H_off,
 136     r8_off,  r8H_off,
 137     rdi_off, rdiH_off,
 138     rsi_off, rsiH_off,
 139     ignore_off, ignoreH_off,  // extra copy of rbp
 140     rsp_off, rspH_off,
 141     rbx_off, rbxH_off,
 142     rdx_off, rdxH_off,
 143     rcx_off, rcxH_off,
 144     rax_off, raxH_off,
 145     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 146     align_off, alignH_off,
 147     flags_off, flagsH_off,
 148     // The frame sender code expects that rbp will be in the "natural" place and
 149     // will override any oopMap setting for it. We must therefore force the layout
 150     // so that it agrees with the frame sender code.
 151     rbp_off, rbpH_off,        // copy of rbp we will restore
 152     return_off, returnH_off,  // slot for return address
 153     reg_save_size             // size in compiler stack slots
 154   };
 155 
 156  public:
 157   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 158   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 159 
 160   // Offsets into the register save area
 161   // Used by deoptimization when it is managing result register
 162   // values on its own
 163 
 164   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 165   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 166   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 167   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 168   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 169 
 170   // During deoptimization only the result registers need to be restored,
 171   // all the other values have already been extracted.
 172   static void restore_result_registers(MacroAssembler* masm);
 173 };
 174 
 175 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 176   int off = 0;
 177   int num_xmm_regs = XMMRegister::available_xmm_registers();
 178 #if COMPILER2_OR_JVMCI
 179   if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 181   }
 182   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 183 #else
 184   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 185 #endif
 186 
  // Always make the frame size 16-byte aligned; both vector and non-vector frames allocate the full save area.
  // (num_xmm_regs is 16 or 32, so using it as the byte alignment below is sufficient.)
 188   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 189   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 190   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 191   // CodeBlob frame size is in words.
 192   int frame_size_in_words = frame_size_in_bytes / wordSize;
 193   *total_frame_words = frame_size_in_words;
 194 
 195   // Save registers, fpu state, and flags.
 196   // We assume caller has already pushed the return address onto the
 197   // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address just as a normal enter would leave it.
 200 
 201   __ enter();          // rsp becomes 16-byte aligned here
 202   __ push_CPU_state(); // Push a multiple of 16 bytes
 203 
  // push_CPU_state handles this on EVEX-enabled targets
 205   if (save_wide_vectors) {
 206     // Save upper half of YMM registers(0..15)
 207     int base_addr = XSAVE_AREA_YMM_BEGIN;
 208     for (int n = 0; n < 16; n++) {
 209       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 210     }
 211     if (VM_Version::supports_evex()) {
 212       // Save upper half of ZMM registers(0..15)
 213       base_addr = XSAVE_AREA_ZMM_BEGIN;
 214       for (int n = 0; n < 16; n++) {
 215         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 216       }
 217       // Save full ZMM registers(16..num_xmm_regs)
 218       base_addr = XSAVE_AREA_UPPERBANK;
 219       off = 0;
 220       int vector_len = Assembler::AVX_512bit;
 221       for (int n = 16; n < num_xmm_regs; n++) {
 222         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 223       }
 224 #if COMPILER2_OR_JVMCI
 225       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 226       off = 0;
 227       for(int n = 0; n < KRegister::number_of_registers; n++) {
 228         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 229       }
 230 #endif
 231     }
 232   } else {
 233     if (VM_Version::supports_evex()) {
 234       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 235       int base_addr = XSAVE_AREA_UPPERBANK;
 236       off = 0;
 237       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 238       for (int n = 16; n < num_xmm_regs; n++) {
 239         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 240       }
 241 #if COMPILER2_OR_JVMCI
 242       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 243       off = 0;
 244       for(int n = 0; n < KRegister::number_of_registers; n++) {
 245         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 246       }
 247 #endif
 248     }
 249   }
 250   __ vzeroupper();
 251   if (frame::arg_reg_save_area_bytes != 0) {
 252     // Allocate argument register save area
 253     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 254   }
 255 
 256   // Set an oopmap for the call site.  This oopmap will map all
 257   // oop-registers and debug-info registers as callee-saved.  This
 258   // will allow deoptimization at this safepoint to find all possible
 259   // debug-info recordings, as well as let GC find all oops.
 260 
 261   OopMapSet *oop_maps = new OopMapSet();
 262   OopMap* map = new OopMap(frame_size_in_slots, 0);
 263 
 264 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 265 
 266   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 267   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 270   // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
 272   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 273   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX-enabled targets we get it included in the xsave area
 284   off = xmm0_off;
 285   int delta = xmm1_off - off;
 286   for (int n = 0; n < 16; n++) {
 287     XMMRegister xmm_name = as_XMMRegister(n);
 288     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 289     off += delta;
 290   }
 291   if (UseAVX > 2) {
 292     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 293     off = zmm16_off;
 294     delta = zmm17_off - off;
 295     for (int n = 16; n < num_xmm_regs; n++) {
 296       XMMRegister zmm_name = as_XMMRegister(n);
 297       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 298       off += delta;
 299     }
 300   }
 301 
 302 #if COMPILER2_OR_JVMCI
 303   if (save_wide_vectors) {
 304     // Save upper half of YMM registers(0..15)
 305     off = ymm0_off;
 306     delta = ymm1_off - ymm0_off;
 307     for (int n = 0; n < 16; n++) {
 308       XMMRegister ymm_name = as_XMMRegister(n);
 309       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 310       off += delta;
 311     }
 312     if (VM_Version::supports_evex()) {
 313       // Save upper half of ZMM registers(0..15)
 314       off = zmm0_off;
 315       delta = zmm1_off - zmm0_off;
 316       for (int n = 0; n < 16; n++) {
 317         XMMRegister zmm_name = as_XMMRegister(n);
 318         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 319         off += delta;
 320       }
 321     }
 322   }
 323 #endif // COMPILER2_OR_JVMCI
 324 
 325   // %%% These should all be a waste but we'll keep things as they were for now
 326   if (true) {
 327     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 328     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 331     // rbp location is known implicitly by the frame sender code, needs no oopmap
 332     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 333     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX-enabled targets we get it included in the xsave area
 344     off = xmm0H_off;
 345     delta = xmm1H_off - off;
 346     for (int n = 0; n < 16; n++) {
 347       XMMRegister xmm_name = as_XMMRegister(n);
 348       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 349       off += delta;
 350     }
 351     if (UseAVX > 2) {
 352       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 353       off = zmm16H_off;
 354       delta = zmm17H_off - off;
 355       for (int n = 16; n < num_xmm_regs; n++) {
 356         XMMRegister zmm_name = as_XMMRegister(n);
 357         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 358         off += delta;
 359       }
 360     }
 361   }
 362 
 363   return map;
 364 }
 365 
 366 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 367   int num_xmm_regs = XMMRegister::available_xmm_registers();
 368   if (frame::arg_reg_save_area_bytes != 0) {
 369     // Pop arg register save area
 370     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 371   }
 372 
 373 #if COMPILER2_OR_JVMCI
 374   if (restore_wide_vectors) {
 375     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 376     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 377   }
 378 #else
 379   assert(!restore_wide_vectors, "vectors are generated only by C2");
 380 #endif
 381 
 382   __ vzeroupper();
 383 
  // On EVEX-enabled targets everything is handled in the FPU state pop (pop_CPU_state below)
 385   if (restore_wide_vectors) {
 386     // Restore upper half of YMM registers (0..15)
 387     int base_addr = XSAVE_AREA_YMM_BEGIN;
 388     for (int n = 0; n < 16; n++) {
 389       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 390     }
 391     if (VM_Version::supports_evex()) {
 392       // Restore upper half of ZMM registers (0..15)
 393       base_addr = XSAVE_AREA_ZMM_BEGIN;
 394       for (int n = 0; n < 16; n++) {
 395         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 396       }
 397       // Restore full ZMM registers(16..num_xmm_regs)
 398       base_addr = XSAVE_AREA_UPPERBANK;
 399       int vector_len = Assembler::AVX_512bit;
 400       int off = 0;
 401       for (int n = 16; n < num_xmm_regs; n++) {
 402         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 403       }
 404 #if COMPILER2_OR_JVMCI
 405       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 406       off = 0;
 407       for (int n = 0; n < KRegister::number_of_registers; n++) {
 408         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 409       }
 410 #endif
 411     }
 412   } else {
 413     if (VM_Version::supports_evex()) {
 414       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 415       int base_addr = XSAVE_AREA_UPPERBANK;
 416       int off = 0;
 417       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 418       for (int n = 16; n < num_xmm_regs; n++) {
 419         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 420       }
 421 #if COMPILER2_OR_JVMCI
 422       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 423       off = 0;
 424       for (int n = 0; n < KRegister::number_of_registers; n++) {
 425         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 426       }
 427 #endif
 428     }
 429   }
 430 
 431   // Recover CPU state
 432   __ pop_CPU_state();
 433   // Get the rbp described implicitly by the calling convention (no oopMap)
 434   __ pop(rbp);
 435 }
 436 
 437 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 438 
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee-save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only the result registers need to be restored here.
 444 
 445   // Restore fp result register
 446   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 447   // Restore integer result register
 448   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 449   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 450 
  // Pop the whole register save area off the stack except the return address
 452   __ addptr(rsp, return_offset_in_bytes());
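  // rsp now points at the saved return address: return_off is the last slot of the
  // save area in the layout enum above.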
 453 }
 454 
// Is the vector's size (in bytes) bigger than the size saved by default?
// The 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 457 bool SharedRuntime::is_wide_vector(int size) {
 458   return size > 16;
 459 }
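// (fxsave/fxrstor cover only the 128-bit XMM state, which is why anything wider
// than 16 bytes needs the explicit save/restore paths in save_live_registers /
// restore_live_registers above.)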
 460 
 461 // ---------------------------------------------------------------------------
 462 // Read the array of BasicTypes from a signature, and compute where the
 463 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 464 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 465 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 466 // as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot at 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
 471 
 472 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 473 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.
 475 
 476 // The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static JNI methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
 480 // advantage out of it.
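// Worked example (purely illustrative): for a signature (long, int, double) the
// expanded sig_bt is { T_LONG, T_VOID, T_INT, T_DOUBLE, T_VOID } and the loop
// below assigns T_LONG -> INT_ArgReg[0] (set2), T_INT -> INT_ArgReg[1] (set1),
// T_DOUBLE -> FP_ArgReg[0] (set2), with both T_VOID halves set_bad(); no stack
// slots are needed, so the function returns 0.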
 481 
 482 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 483                                            VMRegPair *regs,
 484                                            int total_args_passed) {
 485 
 486   // Create the mapping between argument positions and
 487   // registers.
 488   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 489     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 490   };
 491   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 492     j_farg0, j_farg1, j_farg2, j_farg3,
 493     j_farg4, j_farg5, j_farg6, j_farg7
 494   };
 495 
 496 
 497   uint int_args = 0;
 498   uint fp_args = 0;
 499   uint stk_args = 0;
 500 
 501   for (int i = 0; i < total_args_passed; i++) {
 502     switch (sig_bt[i]) {
 503     case T_BOOLEAN:
 504     case T_CHAR:
 505     case T_BYTE:
 506     case T_SHORT:
 507     case T_INT:
 508       if (int_args < Argument::n_int_register_parameters_j) {
 509         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 510       } else {
 511         stk_args = align_up(stk_args, 2);
 512         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 513         stk_args += 1;
 514       }
 515       break;
 516     case T_VOID:
 517       // halves of T_LONG or T_DOUBLE
 518       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 519       regs[i].set_bad();
 520       break;
 521     case T_LONG:
 522       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 523       // fall through
 524     case T_OBJECT:
 525     case T_ARRAY:
 526     case T_ADDRESS:
 527       if (int_args < Argument::n_int_register_parameters_j) {
 528         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 529       } else {
 530         stk_args = align_up(stk_args, 2);
 531         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 532         stk_args += 2;
 533       }
 534       break;
 535     case T_FLOAT:
 536       if (fp_args < Argument::n_float_register_parameters_j) {
 537         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 538       } else {
 539         stk_args = align_up(stk_args, 2);
 540         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 541         stk_args += 1;
 542       }
 543       break;
 544     case T_DOUBLE:
 545       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 546       if (fp_args < Argument::n_float_register_parameters_j) {
 547         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 548       } else {
 549         stk_args = align_up(stk_args, 2);
 550         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 551         stk_args += 2;
 552       }
 553       break;
 554     default:
 555       ShouldNotReachHere();
 556       break;
 557     }
 558   }
 559 
 560   return stk_args;
 561 }
 562 
// Patch the caller's callsite with the entry to compiled code if it exists.
 564 static void patch_callers_callsite(MacroAssembler *masm) {
 565   Label L;
 566   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 567   __ jcc(Assembler::equal, L);
 568 
 569   // Save the current stack pointer
 570   __ mov(r13, rsp);
 571   // Schedule the branch target address early.
 572   // Call into the VM to patch the caller, then jump to compiled callee
 573   // rax isn't live so capture return address while we easily can
 574   __ movptr(rax, Address(rsp, 0));
 575 
 576   // align stack so push_CPU_state doesn't fault
 577   __ andptr(rsp, -(StackAlignmentInBytes));
 578   __ push_CPU_state();
 579   __ vzeroupper();
 580   // VM needs caller's callsite
 581   // VM needs target method
 582   // This needs to be a long call since we will relocate this adapter to
 583   // the codeBuffer and it may not reach
 584 
 585   // Allocate argument register save area
 586   if (frame::arg_reg_save_area_bytes != 0) {
 587     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 588   }
 589   __ mov(c_rarg0, rbx);
 590   __ mov(c_rarg1, rax);
 591   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 592 
 593   // De-allocate argument register save area
 594   if (frame::arg_reg_save_area_bytes != 0) {
 595     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 596   }
 597 
 598   __ vzeroupper();
 599   __ pop_CPU_state();
 600   // restore sp
 601   __ mov(rsp, r13);
 602   __ bind(L);
 603 }
 604 
 605 
 606 static void gen_c2i_adapter(MacroAssembler *masm,
 607                             int total_args_passed,
 608                             int comp_args_on_stack,
 609                             const BasicType *sig_bt,
 610                             const VMRegPair *regs,
 611                             Label& skip_fixup) {
 612   // Before we get into the guts of the C2I adapter, see if we should be here
 613   // at all.  We've come from compiled code and are attempting to jump to the
 614   // interpreter, which means the caller made a static call to get here
 615   // (vcalls always get a compiled target if there is one).  Check for a
 616   // compiled target.  If there is one, we need to patch the caller's call.
 617   patch_callers_callsite(masm);
 618 
 619   __ bind(skip_fixup);
 620 
 621   // Since all args are passed on the stack, total_args_passed *
 622   // Interpreter::stackElementSize is the space we need.
 623 
 624   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 625 
 626   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 627 
 628   // stack is aligned, keep it that way
 629   // This is not currently needed or enforced by the interpreter, but
 630   // we might as well conform to the ABI.
 631   extraspace = align_up(extraspace, 2*wordSize);
 632 
 633   // set senderSP value
 634   __ lea(r13, Address(rsp, wordSize));
 635 
 636 #ifdef ASSERT
 637   __ check_stack_alignment(r13, "sender stack not aligned");
 638 #endif
 639   if (extraspace > 0) {
 640     // Pop the return address
 641     __ pop(rax);
 642 
 643     __ subptr(rsp, extraspace);
 644 
 645     // Push the return address
 646     __ push(rax);
 647 
 648     // Account for the return address location since we store it first rather
 649     // than hold it in a register across all the shuffling
 650     extraspace += wordSize;
 651   }
 652 
 653 #ifdef ASSERT
 654   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 655 #endif
 656 
 657   // Now write the args into the outgoing interpreter space
 658   for (int i = 0; i < total_args_passed; i++) {
 659     if (sig_bt[i] == T_VOID) {
 660       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 661       continue;
 662     }
 663 
    // offset to the start of the parameters
 665     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 666     int next_off = st_off - Interpreter::stackElementSize;
 667 
 668     // Say 4 args:
 669     // i   st_off
 670     // 0   32 T_LONG
 671     // 1   24 T_VOID
 672     // 2   16 T_OBJECT
 673     // 3    8 T_BOOL
 674     // -    0 return address
 675     //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
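    // Concretely, for the 4-arg example above the T_LONG at i == 0 is stored once,
    // at next_off == 24 (the slot labelled T_VOID); in debug builds the unused slot
    // at st_off == 32 is filled with a recognizable junk pattern (see the mov64 calls below).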
 680 
 681     VMReg r_1 = regs[i].first();
 682     VMReg r_2 = regs[i].second();
 683     if (!r_1->is_valid()) {
 684       assert(!r_2->is_valid(), "");
 685       continue;
 686     }
 687     if (r_1->is_stack()) {
      // memory-to-memory copy: use rax as a temporary
 689       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 690       if (!r_2->is_valid()) {
 691         // sign extend??
 692         __ movl(rax, Address(rsp, ld_off));
 693         __ movptr(Address(rsp, st_off), rax);
 694 
 695       } else {
 696 
 697         __ movq(rax, Address(rsp, ld_off));
 698 
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, or T_LONG;
        // T_DOUBLE and T_LONG use two slots in the interpreter.
 701         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 702           // ld_off == LSW, ld_off+wordSize == MSW
 703           // st_off == MSW, next_off == LSW
 704           __ movq(Address(rsp, next_off), rax);
 705 #ifdef ASSERT
 706           // Overwrite the unused slot with known junk
 707           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 708           __ movptr(Address(rsp, st_off), rax);
 709 #endif /* ASSERT */
 710         } else {
 711           __ movq(Address(rsp, st_off), rax);
 712         }
 713       }
 714     } else if (r_1->is_Register()) {
 715       Register r = r_1->as_Register();
 716       if (!r_2->is_valid()) {
        // must be only an int (or smaller), so move only 32 bits to the slot
        // why not sign extend??
 719         __ movl(Address(rsp, st_off), r);
 720       } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, or T_LONG;
        // T_DOUBLE and T_LONG use two slots in the interpreter.
 723         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 724           // long/double in gpr
 725 #ifdef ASSERT
 726           // Overwrite the unused slot with known junk
 727           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 728           __ movptr(Address(rsp, st_off), rax);
 729 #endif /* ASSERT */
 730           __ movq(Address(rsp, next_off), r);
 731         } else {
 732           __ movptr(Address(rsp, st_off), r);
 733         }
 734       }
 735     } else {
 736       assert(r_1->is_XMMRegister(), "");
 737       if (!r_2->is_valid()) {
        // only a float; use just part of the slot
 739         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 740       } else {
 741 #ifdef ASSERT
 742         // Overwrite the unused slot with known junk
 743         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 744         __ movptr(Address(rsp, st_off), rax);
 745 #endif /* ASSERT */
 746         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 747       }
 748     }
 749   }
 750 
 751   // Schedule the branch target address early.
 752   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 753   __ jmp(rcx);
 754 }
 755 
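// Emits an inline range check: control transfers to L_ok iff code_start < pc_reg < code_end.
// On failure it simply falls through (L_fail is bound at the end), so the caller is expected
// to follow the last range_check with its own failure handling (see the __ stop() below).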
 756 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 757                         address code_start, address code_end,
 758                         Label& L_ok) {
 759   Label L_fail;
 760   __ lea(temp_reg, ExternalAddress(code_start));
 761   __ cmpptr(pc_reg, temp_reg);
 762   __ jcc(Assembler::belowEqual, L_fail);
 763   __ lea(temp_reg, ExternalAddress(code_end));
 764   __ cmpptr(pc_reg, temp_reg);
 765   __ jcc(Assembler::below, L_ok);
 766   __ bind(L_fail);
 767 }
 768 
 769 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 770                                     int total_args_passed,
 771                                     int comp_args_on_stack,
 772                                     const BasicType *sig_bt,
 773                                     const VMRegPair *regs) {
 774 
  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment we expect in all compiled code, and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
 783 
 784   // Adapters can be frameless because they do not require the caller
 785   // to perform additional cleanup work, such as correcting the stack pointer.
 786   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 787   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 788   // even if a callee has modified the stack pointer.
 789   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 790   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 791   // up via the senderSP register).
 792   // In other words, if *either* the caller or callee is interpreted, we can
 793   // get the stack pointer repaired after a call.
 794   // This is why c2i and i2c adapters cannot be indefinitely composed.
 795   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 796   // both caller and callee would be compiled methods, and neither would
 797   // clean up the stack pointer changes performed by the two adapters.
 798   // If this happens, control eventually transfers back to the compiled
 799   // caller, but with an uncorrected stack, causing delayed havoc.
 800 
 801   if (VerifyAdapterCalls &&
 802       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 803     // So, let's test for cascading c2i/i2c adapters right now.
 804     //  assert(Interpreter::contains($return_addr) ||
 805     //         StubRoutines::contains($return_addr),
 806     //         "i2c adapter must return to an interpreter frame");
 807     __ block_comment("verify_i2c { ");
 808     // Pick up the return address
 809     __ movptr(rax, Address(rsp, 0));
 810     Label L_ok;
 811     if (Interpreter::code() != nullptr) {
 812       range_check(masm, rax, r11,
 813                   Interpreter::code()->code_start(),
 814                   Interpreter::code()->code_end(),
 815                   L_ok);
 816     }
 817     if (StubRoutines::initial_stubs_code() != nullptr) {
 818       range_check(masm, rax, r11,
 819                   StubRoutines::initial_stubs_code()->code_begin(),
 820                   StubRoutines::initial_stubs_code()->code_end(),
 821                   L_ok);
 822     }
 823     if (StubRoutines::final_stubs_code() != nullptr) {
 824       range_check(masm, rax, r11,
 825                   StubRoutines::final_stubs_code()->code_begin(),
 826                   StubRoutines::final_stubs_code()->code_end(),
 827                   L_ok);
 828     }
 829     const char* msg = "i2c adapter must return to an interpreter frame";
 830     __ block_comment(msg);
 831     __ stop(msg);
 832     __ bind(L_ok);
 833     __ block_comment("} verify_i2ce ");
 834   }
 835 
 836   // Must preserve original SP for loading incoming arguments because
 837   // we need to align the outgoing SP for compiled code.
 838   __ movptr(r11, rsp);
 839 
 840   // Pick up the return address
 841   __ pop(rax);
 842 
 843   // Convert 4-byte c2 stack slots to words.
 844   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 845 
 846   if (comp_args_on_stack) {
 847     __ subptr(rsp, comp_words_on_stack * wordSize);
 848   }
 849 
 850   // Ensure compiled code always sees stack at proper alignment
 851   __ andptr(rsp, -16);
 852 
  // Push the return address so the stack is misaligned by one word, which is
  // exactly how the youngest frame expects to see it right after a call instruction.
 855   __ push(rax);
 856 
 857   // Put saved SP in another register
 858   const Register saved_sp = rax;
 859   __ movptr(saved_sp, r11);
 860 
 861   // Will jump to the compiled code just as if compiled code was doing it.
 862   // Pre-load the register-jump target early, to schedule it better.
 863   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 864 
 865 #if INCLUDE_JVMCI
 866   if (EnableJVMCI) {
 867     // check if this call should be routed towards a specific entry point
 868     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 869     Label no_alternative_target;
 870     __ jcc(Assembler::equal, no_alternative_target);
 871     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 872     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 873     __ bind(no_alternative_target);
 874   }
 875 #endif // INCLUDE_JVMCI
 876 
 877   // Now generate the shuffle code.  Pick up all register args and move the
 878   // rest through the floating point stack top.
 879   for (int i = 0; i < total_args_passed; i++) {
 880     if (sig_bt[i] == T_VOID) {
 881       // Longs and doubles are passed in native word order, but misaligned
 882       // in the 32-bit build.
 883       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 884       continue;
 885     }
 886 
 887     // Pick up 0, 1 or 2 words from SP+offset.
 888 
 889     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 890             "scrambled load targets?");
 891     // Load in argument order going down.
 892     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 893     // Point to interpreter value (vs. tag)
 894     int next_off = ld_off - Interpreter::stackElementSize;
 898     VMReg r_1 = regs[i].first();
 899     VMReg r_2 = regs[i].second();
 900     if (!r_1->is_valid()) {
 901       assert(!r_2->is_valid(), "");
 902       continue;
 903     }
 904     if (r_1->is_stack()) {
 905       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 906       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 907 
      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going through a c2i because of a miss, a reasonable value of r13
      // will be generated.
 911       if (!r_2->is_valid()) {
 912         // sign extend???
 913         __ movl(r13, Address(saved_sp, ld_off));
 914         __ movptr(Address(rsp, st_off), r13);
 915       } else {
 916         //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed at negative offsets, so the LSW ends up at the lower address.
 923 
 924         // ld_off is MSW so get LSW
 925         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 926                            next_off : ld_off;
 927         __ movq(r13, Address(saved_sp, offset));
 928         // st_off is LSW (i.e. reg.first())
 929         __ movq(Address(rsp, st_off), r13);
 930       }
 931     } else if (r_1->is_Register()) {  // Register argument
 932       Register r = r_1->as_Register();
 933       assert(r != rax, "must be different");
 934       if (r_2->is_valid()) {
 935         //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
 939 
 940         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 941                            next_off : ld_off;
 942 
 943         // this can be a misaligned move
 944         __ movq(r, Address(saved_sp, offset));
 945       } else {
 946         // sign extend and use a full word?
 947         __ movl(r, Address(saved_sp, ld_off));
 948       }
 949     } else {
 950       if (!r_2->is_valid()) {
 951         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 952       } else {
 953         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 954       }
 955     }
 956   }
 957 
 958   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 959 
 960   // 6243940 We might end up in handle_wrong_method if
 961   // the callee is deoptimized as we race thru here. If that
 962   // happens we don't want to take a safepoint because the
 963   // caller frame will look interpreted and arguments are now
 964   // "compiled" so it is much better to make this transition
 965   // invisible to the stack walking code. Unfortunately if
  // we try to find the callee by normal means, a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
 969 
 970   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 971 
  // Put the Method* where a c2i would expect it should we end up there.
  // Only needed because c2 resolve stubs return the Method* as a result in
  // rax.
 975   __ mov(rax, rbx);
 976   __ jmp(r11);
 977 }
 978 
 979 // ---------------------------------------------------------------
 980 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 981                                                             int total_args_passed,
 982                                                             int comp_args_on_stack,
 983                                                             const BasicType *sig_bt,
 984                                                             const VMRegPair *regs,
 985                                                             AdapterFingerPrint* fingerprint) {
 986   address i2c_entry = __ pc();
 987 
 988   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 989 
 990   // -------------------------------------------------------------------------
 991   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 992   // to the interpreter.  The args start out packed in the compiled layout.  They
 993   // need to be unpacked into the interpreter layout.  This will almost always
 994   // require some stack space.  We grow the current (compiled) stack, then repack
 995   // the args.  We  finally end in a jump to the generic interpreter entry point.
 996   // On exit from the interpreter, the interpreter will restore our SP (lest the
 997   // compiled code, which relies solely on SP and not RBP, get sick).
 998 
 999   address c2i_unverified_entry = __ pc();
1000   Label skip_fixup;
1001 
1002   Register data = rax;
1003   Register receiver = j_rarg0;
1004   Register temp = rbx;
1005 
1006   {
1007     __ ic_check(1 /* end_alignment */);
1008     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1009     // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
1011     // the call site corrected.
1012     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1013     __ jcc(Assembler::equal, skip_fixup);
1014     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1015   }
1016 
1017   address c2i_entry = __ pc();
1018 
1019   // Class initialization barrier for static methods
1020   address c2i_no_clinit_check_entry = nullptr;
1021   if (VM_Version::supports_fast_class_init_checks()) {
1022     Label L_skip_barrier;
1023     Register method = rbx;
1024 
1025     { // Bypass the barrier for non-static methods
1026       Register flags = rscratch1;
1027       __ movl(flags, Address(method, Method::access_flags_offset()));
1028       __ testl(flags, JVM_ACC_STATIC);
1029       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1030     }
1031 
1032     Register klass = rscratch1;
1033     __ load_method_holder(klass, method);
1034     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1035 
1036     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1037 
1038     __ bind(L_skip_barrier);
1039     c2i_no_clinit_check_entry = __ pc();
1040   }
1041 
1042   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1043   bs->c2i_entry_barrier(masm);
1044 
1045   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1046 
1047   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1048 }
1049 
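// Worked example (illustrative only): for a native signature (int, double, long),
// sig_bt == { T_INT, T_DOUBLE, T_VOID, T_LONG, T_VOID }. On System V targets this
// assigns c_rarg0, c_farg0 and c_rarg1 with no stack slots; on Win64 each register
// argument also reserves a 2-slot shadow area and the total is padded up to 8 slots,
// so the function returns 0 and 8 respectively.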
1050 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1051                                          VMRegPair *regs,
1052                                          int total_args_passed) {
1053 
// We return the number of VMRegImpl stack slots we need to reserve for all
1055 // the arguments NOT counting out_preserve_stack_slots.
1056 
1057 // NOTE: These arrays will have to change when c1 is ported
1058 #ifdef _WIN64
1059     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1060       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1061     };
1062     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1063       c_farg0, c_farg1, c_farg2, c_farg3
1064     };
1065 #else
1066     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1067       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1068     };
1069     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1070       c_farg0, c_farg1, c_farg2, c_farg3,
1071       c_farg4, c_farg5, c_farg6, c_farg7
1072     };
1073 #endif // _WIN64
1074 
1075 
1076     uint int_args = 0;
1077     uint fp_args = 0;
1078     uint stk_args = 0; // inc by 2 each time
1079 
1080     for (int i = 0; i < total_args_passed; i++) {
1081       switch (sig_bt[i]) {
1082       case T_BOOLEAN:
1083       case T_CHAR:
1084       case T_BYTE:
1085       case T_SHORT:
1086       case T_INT:
1087         if (int_args < Argument::n_int_register_parameters_c) {
1088           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1089 #ifdef _WIN64
1090           fp_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1092           stk_args += 2;
1093 #endif
1094         } else {
1095           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1096           stk_args += 2;
1097         }
1098         break;
1099       case T_LONG:
1100         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1101         // fall through
1102       case T_OBJECT:
1103       case T_ARRAY:
1104       case T_ADDRESS:
1105       case T_METADATA:
1106         if (int_args < Argument::n_int_register_parameters_c) {
1107           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1108 #ifdef _WIN64
1109           fp_args++;
1110           stk_args += 2;
1111 #endif
1112         } else {
1113           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1114           stk_args += 2;
1115         }
1116         break;
1117       case T_FLOAT:
1118         if (fp_args < Argument::n_float_register_parameters_c) {
1119           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1120 #ifdef _WIN64
1121           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1123           stk_args += 2;
1124 #endif
1125         } else {
1126           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1127           stk_args += 2;
1128         }
1129         break;
1130       case T_DOUBLE:
1131         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1132         if (fp_args < Argument::n_float_register_parameters_c) {
1133           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1134 #ifdef _WIN64
1135           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1137           stk_args += 2;
1138 #endif
1139         } else {
1140           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1141           stk_args += 2;
1142         }
1143         break;
1144       case T_VOID: // Halves of longs and doubles
1145         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1146         regs[i].set_bad();
1147         break;
1148       default:
1149         ShouldNotReachHere();
1150         break;
1151       }
1152     }
1153 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
1156   if (stk_args < 8) {
1157     stk_args = 8;
1158   }
1159 #endif // _WIN64
1160 
1161   return stk_args;
1162 }
1163 
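// Vector arguments are passed entirely in XMM/YMM/ZMM registers: each argument i gets
// VEC_ArgReg[i], recorded as a VMReg pair spanning num_bits/32 consecutive 32-bit slots
// (e.g. a 256-bit vector occupies vmreg .. vmreg->next(7)). No stack slots are used,
// so the function always returns 0.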
1164 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1165                                              uint num_bits,
1166                                              uint total_args_passed) {
1167   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1168          "only certain vector sizes are supported for now");
1169 
1170   static const XMMRegister VEC_ArgReg[32] = {
1171      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1172      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1173     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1174     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1175   };
1176 
1177   uint stk_args = 0;
1178   uint fp_args = 0;
1179 
1180   for (uint i = 0; i < total_args_passed; i++) {
1181     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1182     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1183     regs[i].set_pair(vmreg->next(next_val), vmreg);
1184   }
1185 
1186   return stk_args;
1187 }
1188 
1189 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
  // which by this time is free to use.
1192   switch (ret_type) {
1193   case T_FLOAT:
1194     __ movflt(Address(rbp, -wordSize), xmm0);
1195     break;
1196   case T_DOUBLE:
1197     __ movdbl(Address(rbp, -wordSize), xmm0);
1198     break;
1199   case T_VOID:  break;
1200   default: {
1201     __ movptr(Address(rbp, -wordSize), rax);
1202     }
1203   }
1204 }
1205 
1206 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
  // which by this time is free to use.
1209   switch (ret_type) {
1210   case T_FLOAT:
1211     __ movflt(xmm0, Address(rbp, -wordSize));
1212     break;
1213   case T_DOUBLE:
1214     __ movdbl(xmm0, Address(rbp, -wordSize));
1215     break;
1216   case T_VOID:  break;
1217   default: {
1218     __ movptr(rax, Address(rbp, -wordSize));
1219     }
1220   }
1221 }
1222 
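// save_args/restore_args spill and reload the live register arguments around a VM call.
// XMM arguments get a full 16 bytes of stack (subptr of 2*wordSize) even though only the
// low 8 bytes are stored, presumably so the total rsp adjustment stays a multiple of two
// words; the two helpers must be kept exactly symmetric.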
1223 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1224     for ( int i = first_arg ; i < arg_count ; i++ ) {
1225       if (args[i].first()->is_Register()) {
1226         __ push(args[i].first()->as_Register());
1227       } else if (args[i].first()->is_XMMRegister()) {
1228         __ subptr(rsp, 2*wordSize);
1229         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1230       }
1231     }
1232 }
1233 
1234 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1235     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1236       if (args[i].first()->is_Register()) {
1237         __ pop(args[i].first()->as_Register());
1238       } else if (args[i].first()->is_XMMRegister()) {
1239         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1240         __ addptr(rsp, 2*wordSize);
1241       }
1242     }
1243 }
1244 
1245 static void verify_oop_args(MacroAssembler* masm,
1246                             const methodHandle& method,
1247                             const BasicType* sig_bt,
1248                             const VMRegPair* regs) {
1249   Register temp_reg = rbx;  // not part of any compiled calling seq
1250   if (VerifyOops) {
1251     for (int i = 0; i < method->size_of_parameters(); i++) {
1252       if (is_reference_type(sig_bt[i])) {
1253         VMReg r = regs[i].first();
1254         assert(r->is_valid(), "bad oop arg");
1255         if (r->is_stack()) {
1256           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1257           __ verify_oop(temp_reg);
1258         } else {
1259           __ verify_oop(r->as_Register());
1260         }
1261       }
1262     }
1263   }
1264 }
1265 
1266 static void check_continuation_enter_argument(VMReg actual_vmreg,
1267                                               Register expected_reg,
1268                                               const char* name) {
1269   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1270   assert(actual_vmreg->as_Register() == expected_reg,
1271          "%s is in unexpected register: %s instead of %s",
1272          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1273 }
1274 
1275 
1276 //---------------------------- continuation_enter_setup ---------------------------
1277 //
1278 // Arguments:
1279 //   None.
1280 //
1281 // Results:
1282 //   rsp: pointer to blank ContinuationEntry
1283 //
1284 // Kills:
1285 //   rax
1286 //
1287 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1288   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1289   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1290   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1291 
1292   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1293   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1294 
1295   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1296   OopMap* map = new OopMap(frame_size, 0);
1297 
1298   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1299   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1300   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1301 
1302   return map;
1303 }
1304 
1305 //---------------------------- fill_continuation_entry ---------------------------
1306 //
1307 // Arguments:
1308 //   rsp: pointer to blank Continuation entry
1309 //   reg_cont_obj: pointer to the continuation
1310 //   reg_flags: flags
1311 //
1312 // Results:
1313 //   rsp: pointer to filled out ContinuationEntry
1314 //
1315 // Kills:
1316 //   rax
1317 //
1318 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1319   assert_different_registers(rax, reg_cont_obj, reg_flags);
1320 #ifdef ASSERT
1321   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1322 #endif
1323   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1324   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1325   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1326   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1327   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1328 
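       // Stash the caller's cont_fastpath and held monitor count in the entry, then clear
       // both in the thread so the new continuation starts with a clean slate.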
1329   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1330   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1331   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1332   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1333 
1334   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1335   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1336 }
1337 
1338 //---------------------------- continuation_enter_cleanup ---------------------------
1339 //
1340 // Arguments:
1341 //   rsp: pointer to the ContinuationEntry
1342 //
1343 // Results:
1344 //   rsp: pointer to the spilled rbp in the entry frame
1345 //
1346 // Kills:
1347 //   rbx
1348 //
1349 static void continuation_enter_cleanup(MacroAssembler* masm) {
1350 #ifdef ASSERT
1351   Label L_good_sp;
1352   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1353   __ jcc(Assembler::equal, L_good_sp);
1354   __ stop("Incorrect rsp at continuation_enter_cleanup");
1355   __ bind(L_good_sp);
1356 #endif
1357   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1358   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1359 
1360   if (CheckJNICalls) {
1361     // Check if this is a virtual thread continuation
1362     Label L_skip_vthread_code;
1363     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1364     __ jcc(Assembler::equal, L_skip_vthread_code);
1365 
1366     // If the held monitor count is > 0 and this vthread is terminating then
1367     // it failed to release a JNI monitor. So we issue the same log message
1368     // that JavaThread::exit does.
1369     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1370     __ jcc(Assembler::equal, L_skip_vthread_code);
1371 
1372     // rax may hold an exception oop, save it before the call
1373     __ push(rax);
1374     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1375     __ pop(rax);
1376 
1377     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1378     // on termination. The held count is implicitly zeroed below when we restore from
1379     // the parent held count (which has to be zero).
1380     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1381 
1382     __ bind(L_skip_vthread_code);
1383   }
1384 #ifdef ASSERT
1385   else {
1386     // Check if this is a virtual thread continuation
1387     Label L_skip_vthread_code;
1388     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1389     __ jcc(Assembler::equal, L_skip_vthread_code);
1390 
1391     // See comment just above. If not checking JNI calls the JNI count is only
1392     // needed for assertion checking.
1393     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1394 
1395     __ bind(L_skip_vthread_code);
1396   }
1397 #endif
1398 
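       // Restore the parent's held monitor count, unlink this entry from the thread's chain,
       // and release the entry's stack space.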
1399   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1400   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1401 
1402   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1403   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1404   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1405 }
1406 
1407 static void gen_continuation_enter(MacroAssembler* masm,
1408                                    const VMRegPair* regs,
1409                                    int& exception_offset,
1410                                    OopMapSet* oop_maps,
1411                                    int& frame_complete,
1412                                    int& stack_slots,
1413                                    int& interpreted_entry_offset,
1414                                    int& compiled_entry_offset) {
1415 
1416   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1417   int pos_cont_obj   = 0;
1418   int pos_is_cont    = 1;
1419   int pos_is_virtual = 2;
1420 
1421   // The platform-specific calling convention may present the arguments in various registers.
1422   // To simplify the rest of the code, we expect the arguments to reside in these known
1423   // registers, and we additionally check the placement here in case the calling convention
1424   // ever changes.
1425   Register reg_cont_obj   = c_rarg1;
1426   Register reg_is_cont    = c_rarg2;
1427   Register reg_is_virtual = c_rarg3;
1428 
1429   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1430   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1431   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1432 
1433   // Utility methods kill rax, make sure there are no collisions
1434   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1435 
1436   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1437                          relocInfo::static_call_type);
1438 
1439   address start = __ pc();
1440 
1441   Label L_thaw, L_exit;
1442 
1443   // i2i entry, used only in interp_only_mode
1444   interpreted_entry_offset = __ pc() - start;
1445   {
1446 #ifdef ASSERT
1447     Label is_interp_only;
1448     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1449     __ jcc(Assembler::notEqual, is_interp_only);
1450     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1451     __ bind(is_interp_only);
1452 #endif
1453 
1454     __ pop(rax); // return address
1455     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1456     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1457     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1458     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1459     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1460     __ push(rax); // return address
1461     __ push_cont_fastpath();
1462 
1463     __ enter();
1464 
1465     stack_slots = 2; // will be adjusted in setup
1466     OopMap* map = continuation_enter_setup(masm, stack_slots);
1467     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1468     // That is okay: at the very worst we miss an async sample, and we are in interp_only_mode anyway.
1469 
1470     __ verify_oop(reg_cont_obj);
1471 
1472     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1473 
1474     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1475     __ testptr(reg_is_cont, reg_is_cont);
1476     __ jcc(Assembler::notZero, L_thaw);
1477 
1478     // --- Resolve path
1479 
1480     // Make sure the call is patchable
1481     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1482     // Emit stub for static call
1483     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1484     if (stub == nullptr) {
1485       fatal("CodeCache is full at gen_continuation_enter");
1486     }
1487     __ call(resolve);
1488     oop_maps->add_gc_map(__ pc() - start, map);
1489     __ post_call_nop();
1490 
1491     __ jmp(L_exit);
1492   }
1493 
1494   // compiled entry
1495   __ align(CodeEntryAlignment);
1496   compiled_entry_offset = __ pc() - start;
1497   __ enter();
1498 
1499   stack_slots = 2; // will be adjusted in setup
1500   OopMap* map = continuation_enter_setup(masm, stack_slots);
1501 
1502   // Frame is now completed as far as size and linkage.
1503   frame_complete = __ pc() - start;
1504 
1505   __ verify_oop(reg_cont_obj);
1506 
1507   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1508 
1509   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1510   __ testptr(reg_is_cont, reg_is_cont);
1511   __ jccb(Assembler::notZero, L_thaw);
1512 
1513   // --- call Continuation.enter(Continuation c, boolean isContinue)
1514 
1515   // Make sure the call is patchable
1516   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1517 
1518   // Emit stub for static call
1519   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1520   if (stub == nullptr) {
1521     fatal("CodeCache is full at gen_continuation_enter");
1522   }
1523 
1524   // The call needs to be resolved. There's a special case for this in
1525   // SharedRuntime::find_callee_info_helper() which calls
1526   // LinkResolver::resolve_continuation_enter() which resolves the call to
1527   // Continuation.enter(Continuation c, boolean isContinue).
1528   __ call(resolve);
1529 
1530   oop_maps->add_gc_map(__ pc() - start, map);
1531   __ post_call_nop();
1532 
1533   __ jmpb(L_exit);
1534 
1535   // --- Thawing path
1536 
1537   __ bind(L_thaw);
1538 
1539   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1540 
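       // Remember the offset (from the start of this code) of the thaw call's return address.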
1541   ContinuationEntry::_return_pc_offset = __ pc() - start;
1542   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1543   __ post_call_nop();
1544 
1545   // --- Normal exit (resolve/thawing)
1546 
1547   __ bind(L_exit);
1548 
1549   continuation_enter_cleanup(masm);
1550   __ pop(rbp);
1551   __ ret(0);
1552 
1553   // --- Exception handling path
1554 
1555   exception_offset = __ pc() - start;
1556 
1557   continuation_enter_cleanup(masm);
1558   __ pop(rbp);
1559 
1560   __ movptr(c_rarg0, r15_thread);
1561   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1562 
1563   // rax still holds the original exception oop, save it before the call
1564   __ push(rax);
1565 
1566   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1567   __ movptr(rbx, rax);
1568 
1569   // Continue at exception handler:
1570   //   rax: exception oop
1571   //   rbx: exception handler
1572   //   rdx: exception pc
1573   __ pop(rax);
1574   __ verify_oop(rax);
1575   __ pop(rdx);
1576   __ jmp(rbx);
1577 }
1578 
1579 static void gen_continuation_yield(MacroAssembler* masm,
1580                                    const VMRegPair* regs,
1581                                    OopMapSet* oop_maps,
1582                                    int& frame_complete,
1583                                    int& stack_slots,
1584                                    int& compiled_entry_offset) {
1585   enum layout {
1586     rbp_off,
1587     rbpH_off,
1588     return_off,
1589     return_off2,
1590     framesize // inclusive of return address
1591   };
1592   stack_slots = framesize / VMRegImpl::slots_per_word;
1593   assert(stack_slots == 2, "recheck layout");
1594 
1595   address start = __ pc();
1596   compiled_entry_offset = __ pc() - start;
1597   __ enter();
1598   address the_pc = __ pc();
1599 
1600   frame_complete = the_pc - start;
1601 
1602   // This nop must be exactly at the PC we push into the frame info.
1603   // We use this nop for fast CodeBlob lookup, so associate the OopMap
1604   // with it right away.
1605   __ post_call_nop();
1606   OopMap* map = new OopMap(framesize, 1);
1607   oop_maps->add_gc_map(frame_complete, map);
1608 
1609   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1610   __ movptr(c_rarg0, r15_thread);
1611   __ movptr(c_rarg1, rsp);
1612   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1613   __ reset_last_Java_frame(true);
1614 
1615   Label L_pinned;
1616 
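       // freeze_entry() reports its result in rax: zero means the freeze succeeded,
       // non-zero means the continuation could not be frozen (see the L_pinned path below).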
1617   __ testptr(rax, rax);
1618   __ jcc(Assembler::notZero, L_pinned);
1619 
1620   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1621   continuation_enter_cleanup(masm);
1622   __ pop(rbp);
1623   __ ret(0);
1624 
1625   __ bind(L_pinned);
1626 
1627   // Pinned, return to caller
1628 
1629   // handle pending exception thrown by freeze
1630   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1631   Label ok;
1632   __ jcc(Assembler::equal, ok);
1633   __ leave();
1634   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1635   __ bind(ok);
1636 
1637   __ leave();
1638   __ ret(0);
1639 }
1640 
1641 static void gen_special_dispatch(MacroAssembler* masm,
1642                                  const methodHandle& method,
1643                                  const BasicType* sig_bt,
1644                                  const VMRegPair* regs) {
1645   verify_oop_args(masm, method, sig_bt, regs);
1646   vmIntrinsics::ID iid = method->intrinsic_id();
1647 
1648   // Now write the args into the outgoing interpreter space
1649   bool     has_receiver   = false;
1650   Register receiver_reg   = noreg;
1651   int      member_arg_pos = -1;
1652   Register member_reg     = noreg;
1653   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1654   if (ref_kind != 0) {
1655     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1656     member_reg = rbx;  // known to be free at this point
1657     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1658   } else if (iid == vmIntrinsics::_invokeBasic) {
1659     has_receiver = true;
1660   } else if (iid == vmIntrinsics::_linkToNative) {
1661     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1662     member_reg = rbx;  // known to be free at this point
1663   } else {
1664     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1665   }
1666 
1667   if (member_reg != noreg) {
1668     // Load the member_arg into register, if necessary.
1669     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1670     VMReg r = regs[member_arg_pos].first();
1671     if (r->is_stack()) {
1672       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1673     } else {
1674       // no data motion is needed
1675       member_reg = r->as_Register();
1676     }
1677   }
1678 
1679   if (has_receiver) {
1680     // Make sure the receiver is loaded into a register.
1681     assert(method->size_of_parameters() > 0, "oob");
1682     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1683     VMReg r = regs[0].first();
1684     assert(r->is_valid(), "bad receiver arg");
1685     if (r->is_stack()) {
1686       // Porting note:  This assumes that compiled calling conventions always
1687       // pass the receiver oop in a register.  If this is not true on some
1688       // platform, pick a temp and load the receiver from stack.
1689       fatal("receiver always in a register");
1690       receiver_reg = j_rarg0;  // known to be free at this point
1691       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1692     } else {
1693       // no data motion is needed
1694       receiver_reg = r->as_Register();
1695     }
1696   }
1697 
1698   // Figure out which address we are really jumping to:
1699   MethodHandles::generate_method_handle_dispatch(masm, iid,
1700                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1701 }
1702 
1703 // ---------------------------------------------------------------------------
1704 // Generate a native wrapper for a given method.  The method takes arguments
1705 // in the Java compiled code convention, marshals them to the native
1706 // convention (handlizes oops, etc), transitions to native, makes the call,
1707 // returns to java state (possibly blocking), unhandlizes any result and
1708 // returns.
1709 //
1710 // Critical native functions are a shorthand for the use of
1711 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1712 // functions.  The wrapper is expected to unpack the arguments before
1713 // passing them to the callee. Critical native functions leave the state _in_Java,
1714 // since they cannot stop for GC.
1715 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1716 // block and the check for pending exceptions, since it is impossible for them
1717 // to be thrown.
1718 //
1719 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1720                                                 const methodHandle& method,
1721                                                 int compile_id,
1722                                                 BasicType* in_sig_bt,
1723                                                 VMRegPair* in_regs,
1724                                                 BasicType ret_type) {
1725   if (method->is_continuation_native_intrinsic()) {
1726     int exception_offset = -1;
1727     OopMapSet* oop_maps = new OopMapSet();
1728     int frame_complete = -1;
1729     int stack_slots = -1;
1730     int interpreted_entry_offset = -1;
1731     int vep_offset = -1;
1732     if (method->is_continuation_enter_intrinsic()) {
1733       gen_continuation_enter(masm,
1734                              in_regs,
1735                              exception_offset,
1736                              oop_maps,
1737                              frame_complete,
1738                              stack_slots,
1739                              interpreted_entry_offset,
1740                              vep_offset);
1741     } else if (method->is_continuation_yield_intrinsic()) {
1742       gen_continuation_yield(masm,
1743                              in_regs,
1744                              oop_maps,
1745                              frame_complete,
1746                              stack_slots,
1747                              vep_offset);
1748     } else {
1749       guarantee(false, "Unknown Continuation native intrinsic");
1750     }
1751 
1752 #ifdef ASSERT
1753     if (method->is_continuation_enter_intrinsic()) {
1754       assert(interpreted_entry_offset != -1, "Must be set");
1755       assert(exception_offset != -1,         "Must be set");
1756     } else {
1757       assert(interpreted_entry_offset == -1, "Must be unset");
1758       assert(exception_offset == -1,         "Must be unset");
1759     }
1760     assert(frame_complete != -1,    "Must be set");
1761     assert(stack_slots != -1,       "Must be set");
1762     assert(vep_offset != -1,        "Must be set");
1763 #endif
1764 
1765     __ flush();
1766     nmethod* nm = nmethod::new_native_nmethod(method,
1767                                               compile_id,
1768                                               masm->code(),
1769                                               vep_offset,
1770                                               frame_complete,
1771                                               stack_slots,
1772                                               in_ByteSize(-1),
1773                                               in_ByteSize(-1),
1774                                               oop_maps,
1775                                               exception_offset);
1776     if (nm == nullptr) return nm;
1777     if (method->is_continuation_enter_intrinsic()) {
1778       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1779     } else if (method->is_continuation_yield_intrinsic()) {
1780       _cont_doYield_stub = nm;
1781     }
1782     return nm;
1783   }
1784 
1785   if (method->is_method_handle_intrinsic()) {
1786     vmIntrinsics::ID iid = method->intrinsic_id();
1787     intptr_t start = (intptr_t)__ pc();
1788     int vep_offset = ((intptr_t)__ pc()) - start;
1789     gen_special_dispatch(masm,
1790                          method,
1791                          in_sig_bt,
1792                          in_regs);
1793     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1794     __ flush();
1795     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1796     return nmethod::new_native_nmethod(method,
1797                                        compile_id,
1798                                        masm->code(),
1799                                        vep_offset,
1800                                        frame_complete,
1801                                        stack_slots / VMRegImpl::slots_per_word,
1802                                        in_ByteSize(-1),
1803                                        in_ByteSize(-1),
1804                                        nullptr);
1805   }
1806   address native_func = method->native_function();
1807   assert(native_func != nullptr, "must have function");
1808 
1809   // An OopMap for lock (and class if static)
1810   OopMapSet *oop_maps = new OopMapSet();
1811   intptr_t start = (intptr_t)__ pc();
1812 
1813   // We have received a description of where all the Java args are located
1814   // on entry to the wrapper. We need to convert these args to where
1815   // the jni function will expect them. To figure out where they go
1816   // we convert the java signature to a C signature by inserting
1817   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1818 
1819   const int total_in_args = method->size_of_parameters();
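       // The native signature prepends the hidden JNIEnv* argument and, for static methods,
       // the class mirror, hence the extra one or two C arguments.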
1820   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1821 
1822   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1823   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1824   BasicType* in_elem_bt = nullptr;
1825 
1826   int argc = 0;
1827   out_sig_bt[argc++] = T_ADDRESS;
1828   if (method->is_static()) {
1829     out_sig_bt[argc++] = T_OBJECT;
1830   }
1831 
1832   for (int i = 0; i < total_in_args ; i++ ) {
1833     out_sig_bt[argc++] = in_sig_bt[i];
1834   }
1835 
1836   // Now figure out where the args must be stored and how much stack space
1837   // they require.
1838   int out_arg_slots;
1839   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1840 
1841   // Compute framesize for the wrapper.  We need to handlize all oops in
1842   // incoming registers
1843 
1844   // Calculate the total number of stack slots we will need.
1845 
1846   // First count the abi requirement plus all of the outgoing args
1847   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1848 
1849   // Now the space for the inbound oop handle area
1850   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1851 
1852   int oop_handle_offset = stack_slots;
1853   stack_slots += total_save_slots;
1854 
1855   // Now any space we need for handlizing a klass if static method
1856 
1857   int klass_slot_offset = 0;
1858   int klass_offset = -1;
1859   int lock_slot_offset = 0;
1860   bool is_static = false;
1861 
1862   if (method->is_static()) {
1863     klass_slot_offset = stack_slots;
1864     stack_slots += VMRegImpl::slots_per_word;
1865     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1866     is_static = true;
1867   }
1868 
1869   // Plus a lock if needed
1870 
1871   if (method->is_synchronized()) {
1872     lock_slot_offset = stack_slots;
1873     stack_slots += VMRegImpl::slots_per_word;
1874   }
1875 
1876   // Now a place (+2) to save return values or temp during shuffling
1877   // + 4 for return address (which we own) and saved rbp
1878   stack_slots += 6;
1879 
1880   // OK, the space we have allocated will look like:
1881   //
1882   //
1883   // FP-> |                     |
1884   //      |---------------------|
1885   //      | 2 slots for moves   |
1886   //      |---------------------|
1887   //      | lock box (if sync)  |
1888   //      |---------------------| <- lock_slot_offset
1889   //      | klass (if static)   |
1890   //      |---------------------| <- klass_slot_offset
1891   //      | oopHandle area      |
1892   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1893   //      | outbound memory     |
1894   //      | based arguments     |
1895   //      |                     |
1896   //      |---------------------|
1897   //      |                     |
1898   // SP-> | out_preserved_slots |
1899   //
1900   //
1901 
1902 
1903   // Now compute the actual number of stack slots we need, rounding up to keep
1904   // the stack properly aligned.
1905   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1906 
1907   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1908 
1909   // First thing, make an ic check to see if we should even be here
1910 
1911   // We are free to use all registers as temps without saving and restoring
1912   // them, except for rbp. rbp is the only callee-save register
1913   // as far as the interpreter and the compiler(s) are concerned.
1914 
1915   const Register receiver = j_rarg0;
1916 
1917   Label exception_pending;
1918 
1919   assert_different_registers(receiver, rscratch1, rscratch2);
1920   __ verify_oop(receiver);
1921   __ ic_check(8 /* end_alignment */);
1922 
1923   int vep_offset = ((intptr_t)__ pc()) - start;
1924 
1925   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1926     Label L_skip_barrier;
1927     Register klass = r10;
1928     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1929     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1930 
1931     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1932 
1933     __ bind(L_skip_barrier);
1934   }
1935 
1936 #ifdef COMPILER1
1937   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1938   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1939     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1940   }
1941 #endif // COMPILER1
1942 
1943   // The instruction at the verified entry point must be 5 bytes or longer
1944   // because it can be patched on the fly by make_non_entrant. The stack bang
1945   // instruction fits that requirement.
1946 
1947   // Generate stack overflow check
1948   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1949 
1950   // Generate a new frame for the wrapper.
1951   __ enter();
1952   // -2 because return address is already present and so is saved rbp
1953   __ subptr(rsp, stack_size - 2*wordSize);
1954 
1955   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1956   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1957   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1958 
1959   // Frame is now completed as far as size and linkage.
1960   int frame_complete = ((intptr_t)__ pc()) - start;
1961 
1962   if (UseRTMLocking) {
1963     // Abort RTM transaction before calling JNI
1964     // because critical section will be large and will be
1965     // aborted anyway. Also nmethod could be deoptimized.
1966     __ xabort(0);
1967   }
1968 
1969 #ifdef ASSERT
1970   __ check_stack_alignment(rsp, "improperly aligned stack");
1971 #endif /* ASSERT */
1972 
1973 
1974   // We use r14 as the oop handle for the receiver/klass
1975   // It is callee save so it survives the call to native
1976 
1977   const Register oop_handle_reg = r14;
1978 
1979   //
1980   // We immediately shuffle the arguments so that any vm call we have to
1981   // make from here on out (sync slow path, jvmti, etc.) we will have
1982   // captured the oops from our caller and have a valid oopMap for
1983   // them.
1984 
1985   // -----------------
1986   // The Grand Shuffle
1987 
1988   // The Java calling convention is either equal (linux) or denser (win64) than the
1989   // c calling convention. However, because of the jni_env argument, the c calling
1990   // convention always has at least one more argument (and two for static methods) than Java.
1991   // Therefore if we move the args from java -> c backwards then we will never have
1992   // a register->register conflict and we don't have to build a dependency graph
1993   // and figure out how to break any cycles.
1994   //
1995 
1996   // Record esp-based slot for receiver on stack for non-static methods
1997   int receiver_offset = -1;
1998 
1999   // This is a trick. We double the stack slots so we can claim
2000   // the oops in the caller's frame. Since we are sure to have
2001   // more args than the caller, doubling is enough to make
2002   // sure we can capture all the incoming oop args from the
2003   // caller.
2004   //
2005   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2006 
2007   // Mark location of rbp (someday)
2008   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2009 
2010   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2011   // All inbound args are referenced based on rbp and all outbound args via rsp.
2012 
2013 
2014 #ifdef ASSERT
2015   bool reg_destroyed[Register::number_of_registers];
2016   bool freg_destroyed[XMMRegister::number_of_registers];
2017   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2018     reg_destroyed[r] = false;
2019   }
2020   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2021     freg_destroyed[f] = false;
2022   }
2023 
2024 #endif /* ASSERT */
2025 
2026   // For JNI natives the incoming and outgoing registers are offset upwards.
2027   GrowableArray<int> arg_order(2 * total_in_args);
2028 
2029   VMRegPair tmp_vmreg;
2030   tmp_vmreg.set2(rbx->as_VMReg());
2031 
2032   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2033     arg_order.push(i);
2034     arg_order.push(c_arg);
2035   }
2036 
2037   int temploc = -1;
2038   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2039     int i = arg_order.at(ai);
2040     int c_arg = arg_order.at(ai + 1);
2041     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2042 #ifdef ASSERT
2043     if (in_regs[i].first()->is_Register()) {
2044       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2045     } else if (in_regs[i].first()->is_XMMRegister()) {
2046       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2047     }
2048     if (out_regs[c_arg].first()->is_Register()) {
2049       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2050     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2051       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2052     }
2053 #endif /* ASSERT */
2054     switch (in_sig_bt[i]) {
2055       case T_ARRAY:
2056       case T_OBJECT:
2057         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2058                     ((i == 0) && (!is_static)),
2059                     &receiver_offset);
2060         break;
2061       case T_VOID:
2062         break;
2063 
2064       case T_FLOAT:
2065         __ float_move(in_regs[i], out_regs[c_arg]);
2066         break;
2067 
2068       case T_DOUBLE:
2069         assert( i + 1 < total_in_args &&
2070                 in_sig_bt[i + 1] == T_VOID &&
2071                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2072         __ double_move(in_regs[i], out_regs[c_arg]);
2073         break;
2074 
2075       case T_LONG :
2076         __ long_move(in_regs[i], out_regs[c_arg]);
2077         break;
2078 
2079       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2080 
2081       default:
2082         __ move32_64(in_regs[i], out_regs[c_arg]);
2083     }
2084   }
2085 
2086   int c_arg;
2087 
2088   // Pre-load a static method's oop into r14.  Used both by locking code and
2089   // the normal JNI call code.
2090   // point c_arg at the first arg that is already loaded in case we
2091   // need to spill before we call out
2092   c_arg = total_c_args - total_in_args;
2093 
2094   if (method->is_static()) {
2095 
2096     //  load oop into a register
2097     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2098 
2099     // Now handlize the static class mirror; it's known to be non-null.
2100     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2101     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2102 
2103     // Now get the handle
2104     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2105     // store the klass handle as second argument
2106     __ movptr(c_rarg1, oop_handle_reg);
2107     // and protect the arg if we must spill
2108     c_arg--;
2109   }
2110 
2111   // Change state to native (we save the return address in the thread, since it might not
2112   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2113   // points into the right code segment. It does not have to be the correct return pc.
2114   // We use the same pc/oopMap repeatedly when we call out
2115 
2116   intptr_t the_pc = (intptr_t) __ pc();
2117   oop_maps->add_gc_map(the_pc - start, map);
2118 
2119   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2120 
2121 
2122   // We have all of the arguments set up at this point. We must not clobber the argument
2123   // registers from here on; where we do save/restore them below they contain no raw oops (they were handlized above).
2124 
2125   {
2126     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2127     // protect the args we've loaded
2128     save_args(masm, total_c_args, c_arg, out_regs);
2129     __ mov_metadata(c_rarg1, method());
2130     __ call_VM_leaf(
2131       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2132       r15_thread, c_rarg1);
2133     restore_args(masm, total_c_args, c_arg, out_regs);
2134   }
2135 
2136   // RedefineClasses() tracing support for obsolete method entry
2137   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2138     // protect the args we've loaded
2139     save_args(masm, total_c_args, c_arg, out_regs);
2140     __ mov_metadata(c_rarg1, method());
2141     __ call_VM_leaf(
2142       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2143       r15_thread, c_rarg1);
2144     restore_args(masm, total_c_args, c_arg, out_regs);
2145   }
2146 
2147   // Lock a synchronized method
2148 
2149   // Register definitions used by locking and unlocking
2150 
2151   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2152   const Register obj_reg  = rbx;  // Will contain the oop
2153   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2154   const Register old_hdr  = r13;  // value of old header at unlock time
2155 
2156   Label slow_path_lock;
2157   Label lock_done;
2158 
2159   if (method->is_synchronized()) {
2160     Label count_mon;
2161 
2162     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2163 
2164     // Get the handle (the 2nd argument)
2165     __ mov(oop_handle_reg, c_rarg1);
2166 
2167     // Get address of the box
2168 
2169     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2170 
2171     // Load the oop from the handle
2172     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2173 
2174     if (LockingMode == LM_MONITOR) {
2175       __ jmp(slow_path_lock);
2176     } else if (LockingMode == LM_LEGACY) {
2177       // Load immediate 1 into swap_reg %rax
2178       __ movl(swap_reg, 1);
2179 
2180       // Load (object->mark() | 1) into swap_reg %rax
2181       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2182 
2183       // Save (object->mark() | 1) into BasicLock's displaced header
2184       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2185 
2186       // src -> dest iff dest == rax else rax <- dest
2187       __ lock();
2188       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2189       __ jcc(Assembler::equal, count_mon);
2190 
2191       // Hmm should this move to the slow path code area???
2192 
2193       // Test if the oopMark is an obvious stack pointer, i.e.,
2194       //  1) (mark & 3) == 0, and
2195       //  2) rsp <= mark < mark + os::pagesize()
2196       // These 3 tests can be done by evaluating the following
2197       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2198       // assuming both stack pointer and pagesize have their
2199       // least significant 2 bits clear.
2200       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2201 
2202       __ subptr(swap_reg, rsp);
2203       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2204 
2205       // Save the test result; for the recursive case, the result is zero
2206       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2207       __ jcc(Assembler::notEqual, slow_path_lock);
2208     } else {
2209       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2210       __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2211     }
2212     __ bind(count_mon);
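         // Locking fast path (or recursive stack-lock) succeeded: bump the thread's held monitor count.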
2213     __ inc_held_monitor_count();
2214 
2215     // Slow path will re-enter here
2216     __ bind(lock_done);
2217   }
2218 
2219   // Finally just about ready to make the JNI call
2220 
2221   // get JNIEnv* which is first argument to native
2222   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2223 
2224   // Now set thread in native
2225   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2226 
2227   __ call(RuntimeAddress(native_func));
2228 
2229   // Verify or restore cpu control state after JNI call
2230   __ restore_cpu_control_state_after_jni(rscratch1);
2231 
2232   // Unpack native results.
2233   switch (ret_type) {
2234   case T_BOOLEAN: __ c2bool(rax);            break;
2235   case T_CHAR   : __ movzwl(rax, rax);      break;
2236   case T_BYTE   : __ sign_extend_byte (rax); break;
2237   case T_SHORT  : __ sign_extend_short(rax); break;
2238   case T_INT    : /* nothing to do */        break;
2239   case T_DOUBLE :
2240   case T_FLOAT  :
2241     // Result is in xmm0 we'll save as needed
2242     break;
2243   case T_ARRAY:                 // Really a handle
2244   case T_OBJECT:                // Really a handle
2245       break; // can't de-handlize until after safepoint check
2246   case T_VOID: break;
2247   case T_LONG: break;
2248   default       : ShouldNotReachHere();
2249   }
2250 
2251   Label after_transition;
2252 
2253   // Switch thread to "native transition" state before reading the synchronization state.
2254   // This additional state is necessary because reading and testing the synchronization
2255   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2256   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2257   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2258   //     Thread A is resumed to finish this native method, but doesn't block here since it
2259   //     didn't see any synchronization in progress, and escapes.
2260   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2261 
2262   // Force this write out before the read below
2263   if (!UseSystemMemoryBarrier) {
2264     __ membar(Assembler::Membar_mask_bits(
2265               Assembler::LoadLoad | Assembler::LoadStore |
2266               Assembler::StoreLoad | Assembler::StoreStore));
2267   }
2268 
2269   // check for safepoint operation in progress and/or pending suspend requests
2270   {
2271     Label Continue;
2272     Label slow_path;
2273 
2274     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2275 
2276     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2277     __ jcc(Assembler::equal, Continue);
2278     __ bind(slow_path);
2279 
2280     // Don't use call_VM as it will see a possible pending exception and forward it
2281     // and never return here preventing us from clearing _last_native_pc down below.
2282     // Can't use call_VM_leaf either, as it will check to see if rsi & rdi are
2283     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2284     // by hand.
2285     //
2286     __ vzeroupper();
2287     save_native_result(masm, ret_type, stack_slots);
2288     __ mov(c_rarg0, r15_thread);
2289     __ mov(r12, rsp); // remember sp
2290     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2291     __ andptr(rsp, -16); // align stack as required by ABI
2292     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2293     __ mov(rsp, r12); // restore sp
2294     __ reinit_heapbase();
2295     // Restore any method result value
2296     restore_native_result(masm, ret_type, stack_slots);
2297     __ bind(Continue);
2298   }
2299 
2300   // change thread state
2301   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2302   __ bind(after_transition);
2303 
2304   Label reguard;
2305   Label reguard_done;
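       // If the stack yellow zone was disabled while we were in native code, take the
       // out-of-line 'reguard' path further down to re-enable it before returning.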
2306   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2307   __ jcc(Assembler::equal, reguard);
2308   __ bind(reguard_done);
2309 
2310   // native result if any is live
2311 
2312   // Unlock
2313   Label slow_path_unlock;
2314   Label unlock_done;
2315   if (method->is_synchronized()) {
2316 
2317     Label fast_done;
2318 
2319     // Get locked oop from the handle we passed to jni
2320     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2321 
2322     if (LockingMode == LM_LEGACY) {
2323       Label not_recur;
2324       // Simple recursive lock?
2325       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2326       __ jcc(Assembler::notEqual, not_recur);
2327       __ dec_held_monitor_count();
2328       __ jmpb(fast_done);
2329       __ bind(not_recur);
2330     }
2331 
2332     // Must save rax if it is live now because cmpxchg must use it
2333     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2334       save_native_result(masm, ret_type, stack_slots);
2335     }
2336 
2337     if (LockingMode == LM_MONITOR) {
2338       __ jmp(slow_path_unlock);
2339     } else if (LockingMode == LM_LEGACY) {
2340       // get address of the stack lock
2341       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2342       //  get old displaced header
2343       __ movptr(old_hdr, Address(rax, 0));
2344 
2345       // Atomic swap old header if oop still contains the stack lock
2346       __ lock();
2347       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2348       __ jcc(Assembler::notEqual, slow_path_unlock);
2349       __ dec_held_monitor_count();
2350     } else {
2351       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2352       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2353       __ dec_held_monitor_count();
2354     }
2355 
2356     // slow path re-enters here
2357     __ bind(unlock_done);
2358     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2359       restore_native_result(masm, ret_type, stack_slots);
2360     }
2361 
2362     __ bind(fast_done);
2363   }
2364   {
2365     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2366     save_native_result(masm, ret_type, stack_slots);
2367     __ mov_metadata(c_rarg1, method());
2368     __ call_VM_leaf(
2369          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2370          r15_thread, c_rarg1);
2371     restore_native_result(masm, ret_type, stack_slots);
2372   }
2373 
2374   __ reset_last_Java_frame(false);
2375 
2376   // Unbox oop result, e.g. JNIHandles::resolve value.
2377   if (is_reference_type(ret_type)) {
2378     __ resolve_jobject(rax /* value */,
2379                        r15_thread /* thread */,
2380                        rcx /* tmp */);
2381   }
2382 
2383   if (CheckJNICalls) {
2384     // clear_pending_jni_exception_check
2385     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2386   }
2387 
2388   // reset handle block
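       // Resetting the block's top releases all JNI local handles created during the call.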
2389   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2390   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2391 
2392   // pop our frame
2393 
2394   __ leave();
2395 
2396   // Any exception pending?
2397   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2398   __ jcc(Assembler::notEqual, exception_pending);
2399 
2400   // Return
2401 
2402   __ ret(0);
2403 
2404   // Unexpected paths are out of line and go here
2405 
2406   // forward the exception
2407   __ bind(exception_pending);
2408 
2409   // and forward the exception
2410   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2411 
2412   // Slow path locking & unlocking
2413   if (method->is_synchronized()) {
2414 
2415     // BEGIN Slow path lock
2416     __ bind(slow_path_lock);
2417 
2418     // We have last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM
2419     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2420 
2421     // protect the args we've loaded
2422     save_args(masm, total_c_args, c_arg, out_regs);
2423 
2424     __ mov(c_rarg0, obj_reg);
2425     __ mov(c_rarg1, lock_reg);
2426     __ mov(c_rarg2, r15_thread);
2427 
2428     // Not a leaf but we have last_Java_frame setup as we want
2429     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2430     restore_args(masm, total_c_args, c_arg, out_regs);
2431 
2432 #ifdef ASSERT
2433     { Label L;
2434       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2435       __ jcc(Assembler::equal, L);
2436       __ stop("no pending exception allowed on exit from monitorenter");
2437       __ bind(L);
2438     }
2439 #endif
2440     __ jmp(lock_done);
2441 
2442     // END Slow path lock
2443 
2444     // BEGIN Slow path unlock
2445     __ bind(slow_path_unlock);
2446 
2447     // If we haven't already saved the native result we must save it now as xmm registers
2448     // are still exposed.
2449     __ vzeroupper();
2450     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2451       save_native_result(masm, ret_type, stack_slots);
2452     }
2453 
2454     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2455 
2456     __ mov(c_rarg0, obj_reg);
2457     __ mov(c_rarg2, r15_thread);
2458     __ mov(r12, rsp); // remember sp
2459     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2460     __ andptr(rsp, -16); // align stack as required by ABI
2461 
2462     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2463     // NOTE that obj_reg == rbx currently
2464     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2465     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2466 
2467     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2468     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2469     __ mov(rsp, r12); // restore sp
2470     __ reinit_heapbase();
2471 #ifdef ASSERT
2472     {
2473       Label L;
2474       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2475       __ jcc(Assembler::equal, L);
2476       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2477       __ bind(L);
2478     }
2479 #endif /* ASSERT */
2480 
2481     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2482 
2483     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2484       restore_native_result(masm, ret_type, stack_slots);
2485     }
2486     __ jmp(unlock_done);
2487 
2488     // END Slow path unlock
2489 
2490   } // synchronized
2491 
2492   // SLOW PATH Reguard the stack if needed
2493 
2494   __ bind(reguard);
2495   __ vzeroupper();
2496   save_native_result(masm, ret_type, stack_slots);
2497   __ mov(r12, rsp); // remember sp
2498   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2499   __ andptr(rsp, -16); // align stack as required by ABI
2500   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2501   __ mov(rsp, r12); // restore sp
2502   __ reinit_heapbase();
2503   restore_native_result(masm, ret_type, stack_slots);
2504   // and continue
2505   __ jmp(reguard_done);
2506 
2507 
2508 
2509   __ flush();
2510 
2511   nmethod *nm = nmethod::new_native_nmethod(method,
2512                                             compile_id,
2513                                             masm->code(),
2514                                             vep_offset,
2515                                             frame_complete,
2516                                             stack_slots / VMRegImpl::slots_per_word,
2517                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2518                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2519                                             oop_maps);
2520 
2521   return nm;
2522 }
2523 
2524 // This function returns the adjustment (in number of words) to the size of a c2i adapter
2525 // activation, for use during deoptimization.
2526 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2527   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2528 }
2529 
2530 
2531 uint SharedRuntime::out_preserve_stack_slots() {
2532   return 0;
2533 }
2534 
2535 
2536 // Number of stack slots between incoming argument block and the start of
2537 // a new frame.  The PROLOG must add this many slots to the stack.  The
2538 // EPILOG must remove this many slots.  amd64 needs two slots for
2539 // return address.
2540 uint SharedRuntime::in_preserve_stack_slots() {
2541   return 4 + 2 * VerifyStackAtCalls;
2542 }
2543 
2544 //------------------------------generate_deopt_blob----------------------------
2545 void SharedRuntime::generate_deopt_blob() {
2546   // Allocate space for the code
2547   ResourceMark rm;
2548   // Setup code generation tools
2549   int pad = 0;
2550   if (UseAVX > 2) {
2551     pad += 1024;
2552   }
2553 #if INCLUDE_JVMCI
2554   if (EnableJVMCI) {
2555     pad += 512; // Increase the buffer size when compiling for JVMCI
2556   }
2557 #endif
2558   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2559   MacroAssembler* masm = new MacroAssembler(&buffer);
2560   int frame_size_in_words;
2561   OopMap* map = nullptr;
2562   OopMapSet *oop_maps = new OopMapSet();
2563 
2564   // -------------
2565   // This code enters when returning to a de-optimized nmethod.  A return
2566   // address has been pushed on the stack, and return values are in
2567   // registers.
2568   // If we are doing a normal deopt then we were called from the patched
2569   // nmethod from the point we returned to the nmethod. So the return
2570   // address on the stack is wrong by NativeCall::instruction_size.
2571   // We will adjust the value so it looks like we have the original return
2572   // address on the stack (like when we eagerly deoptimized).
2573   // In the case of an exception pending when deoptimizing, we enter
2574   // with a return address on the stack that points after the call we patched
2575   // into the exception handler. We have the following register state from,
2576   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2577   //    rax: exception oop
2578   //    rbx: exception handler
2579   //    rdx: throwing pc
2580   // So in this case we simply jam rdx into the useless return address and
2581   // the stack looks just like we want.
2582   //
2583   // At this point we need to de-opt.  We save the argument return
2584   // registers.  We call the first C routine, fetch_unroll_info().  This
2585   // routine captures the return values and returns a structure which
2586   // describes the current frame size and the sizes of all replacement frames.
2587   // The current frame is compiled code and may contain many inlined
2588   // functions, each with their own JVM state.  We pop the current frame, then
2589   // push all the new frames.  Then we call the C routine unpack_frames() to
2590   // populate these frames.  Finally unpack_frames() returns us the new target
2591   // address.  Notice that callee-save registers are BLOWN here; they have
2592   // already been captured in the vframeArray at the time the return PC was
2593   // patched.
2594   address start = __ pc();
2595   Label cont;
2596 
2597   // Prolog for non exception case!
2598 
2599   // Save everything in sight.
2600   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2601 
2602   // Normal deoptimization.  Save exec mode for unpack_frames.
2603   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2604   __ jmp(cont);
2605 
2606   int reexecute_offset = __ pc() - start;
2607 #if INCLUDE_JVMCI && !defined(COMPILER1)
2608   if (EnableJVMCI && UseJVMCICompiler) {
2609     // JVMCI does not use this kind of deoptimization
2610     __ should_not_reach_here();
2611   }
2612 #endif
2613 
2614   // Reexecute case
2615   // the return address is the pc that describes what bci to re-execute at
2616 
2617   // No need to update map as each call to save_live_registers will produce identical oopmap
2618   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2619 
2620   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2621   __ jmp(cont);
2622 
2623 #if INCLUDE_JVMCI
2624   Label after_fetch_unroll_info_call;
2625   int implicit_exception_uncommon_trap_offset = 0;
2626   int uncommon_trap_offset = 0;
2627 
2628   if (EnableJVMCI) {
2629     implicit_exception_uncommon_trap_offset = __ pc() - start;
2630 
2631     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2632     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2633 
2634     uncommon_trap_offset = __ pc() - start;
2635 
2636     // Save everything in sight.
2637     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2638     // fetch_unroll_info needs to call last_java_frame()
2639     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2640 
2641     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2642     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2643 
2644     __ movl(r14, Deoptimization::Unpack_reexecute);
2645     __ mov(c_rarg0, r15_thread);
2646     __ movl(c_rarg2, r14); // exec mode
2647     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2648     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2649 
2650     __ reset_last_Java_frame(false);
2651 
2652     __ jmp(after_fetch_unroll_info_call);
2653   } // EnableJVMCI
2654 #endif // INCLUDE_JVMCI
2655 
2656   int exception_offset = __ pc() - start;
2657 
2658   // Prolog for exception case
2659 
2660   // all registers are dead at this entry point, except for rax and
2661   // rdx, which contain the exception oop and exception pc
2662   // respectively.  Set them in TLS and fall thru to the
2663   // unpack_with_exception_in_tls entry point.
2664 
2665   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2666   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2667 
2668   int exception_in_tls_offset = __ pc() - start;
2669 
2670   // new implementation because exception oop is now passed in JavaThread
2671 
2672   // Prolog for exception case
2673   // All registers must be preserved because they might be used by LinearScan
2674   // Exception oop and throwing PC are passed in JavaThread
2675   // tos: stack at point of call to method that threw the exception (i.e. only
2676   // args are on the stack, no return address)
2677 
2678   // make room on stack for the return address
2679   // It will be patched later with the throwing pc. The correct value is not
2680   // available now because loading it from memory would destroy registers.
2681   __ push(0);
2682 
2683   // Save everything in sight.
2684   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2685 
2686   // Now it is safe to overwrite any register
2687 
2688   // Deopt during an exception.  Save exec mode for unpack_frames.
2689   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2690 
2691   // load throwing pc from JavaThread and patch it as the return address
2692   // of the current frame. Then clear the field in JavaThread
2693 
2694   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2695   __ movptr(Address(rbp, wordSize), rdx);
2696   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2697 
2698 #ifdef ASSERT
2699   // verify that there is really an exception oop in JavaThread
2700   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2701   __ verify_oop(rax);
2702 
2703   // verify that there is no pending exception
2704   Label no_pending_exception;
2705   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2706   __ testptr(rax, rax);
2707   __ jcc(Assembler::zero, no_pending_exception);
2708   __ stop("must not have pending exception here");
2709   __ bind(no_pending_exception);
2710 #endif
2711 
2712   __ bind(cont);
2713 
2714   // Call C code.  Need thread and this frame, but NOT official VM entry
2715   // crud.  We cannot block on this call, no GC can happen.
2716   //
2717   // UnrollBlock* fetch_unroll_info(JavaThread* thread, int exec_mode)
2718 
2719   // fetch_unroll_info needs to call last_java_frame().
2720 
2721   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2722 #ifdef ASSERT
2723   { Label L;
2724     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2725     __ jcc(Assembler::equal, L);
2726     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2727     __ bind(L);
2728   }
2729 #endif // ASSERT
2730   __ mov(c_rarg0, r15_thread);
2731   __ movl(c_rarg1, r14); // exec_mode
2732   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2733 
2734   // Need to have an oopmap that tells fetch_unroll_info where to
2735   // find any register it might need.
2736   oop_maps->add_gc_map(__ pc() - start, map);
2737 
2738   __ reset_last_Java_frame(false);
2739 
2740 #if INCLUDE_JVMCI
2741   if (EnableJVMCI) {
2742     __ bind(after_fetch_unroll_info_call);
2743   }
2744 #endif
2745 
2746   // Load UnrollBlock* into rdi
2747   __ mov(rdi, rax);
2748 
2749   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2750   Label noException;
2751   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2752   __ jcc(Assembler::notEqual, noException);
2753   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2754   // QQQ this load of exception_pc is useless; it was already cleared to null above
2755   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2756   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2757   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2758 
2759   __ verify_oop(rax);
2760 
2761   // Overwrite the result registers with the exception results.
2762   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2763   // I think this is useless
2764   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2765 
2766   __ bind(noException);
2767 
2768   // Only register save data is on the stack.
2769   // Now restore the result registers.  Everything else is either dead
2770   // or captured in the vframeArray.
2771   RegisterSaver::restore_result_registers(masm);
2772 
2773   // All of the register save area has been popped off the stack. Only the
2774   // return address remains.
2775 
2776   // Pop all the frames we must move/replace.
2777   //
2778   // Frame picture (youngest to oldest)
2779   // 1: self-frame (no frame link)
2780   // 2: deopting frame  (no frame link)
2781   // 3: caller of deopting frame (could be compiled/interpreted).
2782   //
2783   // Note: by leaving the return address of self-frame on the stack
2784   // and using the size of frame 2 to adjust the stack,
2785   // when we are done the return to frame 3 will still be on the stack.
2786 
2787   // Pop deoptimized frame
2788   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2789   __ addptr(rsp, rcx);
2790 
2791   // rsp should be pointing at the return address to the caller (3)
2792 
2793   // Pick up the initial fp we should save
2794   // Restore rbp before the stack bang because if a stack overflow is thrown it needs to be pushed (and preserved)
2795   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2796 
2797 #ifdef ASSERT
2798   // Compilers generate code that bangs the stack by as much as the
2799   // interpreter would need. So this stack banging should never
2800   // trigger a fault. Verify that it does not on non-product builds.
2801   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2802   __ bang_stack_size(rbx, rcx);
2803 #endif
2804 
2805   // Load address of array of frame pcs into rcx
2806   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2807 
2808   // Trash the old pc
2809   __ addptr(rsp, wordSize);
2810 
2811   // Load address of array of frame sizes into rsi
2812   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2813 
2814   // Load counter into rdx
2815   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2816 
2817   // Now adjust the caller's stack to make up for the extra locals,
2818   // but record the original sp so that we can save it in the skeletal interpreter
2819   // frame; the stack walking of interpreter_sender will then get the unextended sp
2820   // value and not the "real" sp value.
2821 
2822   const Register sender_sp = r8;
2823 
2824   __ mov(sender_sp, rsp);
2825   __ movl(rbx, Address(rdi,
2826                        Deoptimization::UnrollBlock::
2827                        caller_adjustment_offset()));
2828   __ subptr(rsp, rbx);
2829 
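  // Equivalent pseudocode for the frame-pushing loop below (a sketch only):
  //
  //   for (int k = number_of_frames; k > 0; k--) {
  //     push(*pcs++);                  // return address for this skeletal frame
  //     push(rbp); rbp = rsp;          // enter()
  //     rsp -= *sizes++ - 2*wordSize;  // frame body (pc and rbp already pushed)
  //     frame->last_sp   = nullptr;    // corrected later by layout_activation_impl
  //     frame->sender_sp = sender_sp;  // make the frame walkable
  //     sender_sp = rsp;
  //   }
  //   push(*pcs);                      // final return address, consumed by ret(0) below
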
2830   // Push interpreter frames in a loop
2831   Label loop;
2832   __ bind(loop);
2833   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2834   __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
2835   __ pushptr(Address(rcx, 0));          // Save return address
2836   __ enter();                           // Save old & set new rbp
2837   __ subptr(rsp, rbx);                  // Prolog
2838   // This value is corrected by layout_activation_impl
2839   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2840   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2841   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2842   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2843   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2844   __ decrementl(rdx);                   // Decrement counter
2845   __ jcc(Assembler::notZero, loop);
2846   __ pushptr(Address(rcx, 0));          // Save final return address
2847 
2848   // Re-push self-frame
2849   __ enter();                           // Save old & set new rbp
2850 
2851   // Allocate a full sized register save area.
2852   // Return address and rbp are in place, so we allocate two fewer words.
2853   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2854 
2855   // Restore frame locals after moving the frame
2856   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2857   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2858 
2859   // Call C code.  Need thread but NOT official VM entry
2860   // crud.  We cannot block on this call, no GC can happen.  Call should
2861   // restore return values to their stack-slots with the new SP.
2862   //
2863   // BasicType Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2864 
2865   // Use rbp because the frames look interpreted now
2866   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2867   // Don't need the precise return PC here, just precise enough to point into this code blob.
2868   address the_pc = __ pc();
2869   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2870 
2871   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2872   __ mov(c_rarg0, r15_thread);
2873   __ movl(c_rarg1, r14); // second arg: exec_mode
2874   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2875   // Revert SP alignment after call since we're going to do some SP relative addressing below
2876   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2877 
2878   // Set an oopmap for the call site
2879   // Use the same PC we used for the last java frame
2880   oop_maps->add_gc_map(the_pc - start,
2881                        new OopMap( frame_size_in_words, 0 ));
2882 
2883   // Clear fp AND pc
2884   __ reset_last_Java_frame(true);
2885 
2886   // Collect return values
2887   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2888   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2889   // I think this is useless (throwing pc?)
2890   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2891 
2892   // Pop self-frame.
2893   __ leave();                           // Epilog
2894 
2895   // Jump to interpreter
2896   __ ret(0);
2897 
2898   // Make sure all code is generated
2899   masm->flush();
2900 
2901   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2902   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2903 #if INCLUDE_JVMCI
2904   if (EnableJVMCI) {
2905     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2906     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2907   }
2908 #endif
2909 }
2910 
2911 #ifdef COMPILER2
2912 //------------------------------generate_uncommon_trap_blob--------------------
2913 void SharedRuntime::generate_uncommon_trap_blob() {
2914   // Allocate space for the code
2915   ResourceMark rm;
2916   // Setup code generation tools
2917   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2918   MacroAssembler* masm = new MacroAssembler(&buffer);
2919 
2920   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2921 
2922   address start = __ pc();
2923 
2924   if (UseRTMLocking) {
2925     // Abort RTM transaction before possible nmethod deoptimization.
2926     __ xabort(0);
2927   }
2928 
2929   // Push self-frame.  We get here with a return address on the
2930   // stack, so rsp is 8-byte aligned until we allocate our frame.
2931   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2932 
2933   // No callee saved registers. rbp is assumed implicitly saved
2934   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2935 
2936   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2937   // runtime expects it.
2938   __ movl(c_rarg1, j_rarg0);
2939 
2940   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2941 
2942   // Call C code.  Need thread but NOT official VM entry
2943   // crud.  We cannot block on this call, no GC can happen.  Call should
2944   // capture callee-saved registers as well as return values.
2945   // The thread is passed in c_rarg0 below.
2946   //
2947   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
2948 
2949   __ mov(c_rarg0, r15_thread);
2950   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2951   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2952 
2953   // Set an oopmap for the call site
2954   OopMapSet* oop_maps = new OopMapSet();
2955   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2956 
2957   // location of rbp is known implicitly by the frame sender code
2958 
2959   oop_maps->add_gc_map(__ pc() - start, map);
2960 
2961   __ reset_last_Java_frame(false);
2962 
2963   // Load UnrollBlock* into rdi
2964   __ mov(rdi, rax);
2965 
2966 #ifdef ASSERT
2967   { Label L;
2968     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
2969               Deoptimization::Unpack_uncommon_trap);
2970     __ jcc(Assembler::equal, L);
2971     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2972     __ bind(L);
2973   }
2974 #endif
2975 
2976   // Pop all the frames we must move/replace.
2977   //
2978   // Frame picture (youngest to oldest)
2979   // 1: self-frame (no frame link)
2980   // 2: deopting frame  (no frame link)
2981   // 3: caller of deopting frame (could be compiled/interpreted).
2982 
2983   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2984   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2985 
2986   // Pop deoptimized frame (int)
2987   __ movl(rcx, Address(rdi,
2988                        Deoptimization::UnrollBlock::
2989                        size_of_deoptimized_frame_offset()));
2990   __ addptr(rsp, rcx);
2991 
2992   // rsp should be pointing at the return address to the caller (3)
2993 
2994   // Pick up the initial fp we should save
2995   // Restore rbp before the stack bang because if a stack overflow is thrown it needs to be pushed (and preserved)
2996   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2997 
2998 #ifdef ASSERT
2999   // Compilers generate code that bangs the stack by as much as the
3000   // interpreter would need. So this stack banging should never
3001   // trigger a fault. Verify that it does not on non-product builds.
3002   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3003   __ bang_stack_size(rbx, rcx);
3004 #endif
3005 
3006   // Load address of array of frame pcs into rcx (address*)
3007   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3008 
3009   // Trash the return pc
3010   __ addptr(rsp, wordSize);
3011 
3012   // Load address of array of frame sizes into rsi (intptr_t*)
3013   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3014 
3015   // Counter
3016   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
3017 
3018   // Now adjust the caller's stack to make up for the extra locals, but
3019   // record the original sp so that we can save it in the skeletal
3020   // interpreter frame; the stack walking of interpreter_sender will
3021   // then get the unextended sp value and not the "real" sp value.
3022 
3023   const Register sender_sp = r8;
3024 
3025   __ mov(sender_sp, rsp);
3026   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
3027   __ subptr(rsp, rbx);
3028 
3029   // Push interpreter frames in a loop
3030   Label loop;
3031   __ bind(loop);
3032   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3033   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3034   __ pushptr(Address(rcx, 0));     // Save return address
3035   __ enter();                      // Save old & set new rbp
3036   __ subptr(rsp, rbx);             // Prolog
3037   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3038             sender_sp);            // Make it walkable
3039   // This value is corrected by layout_activation_impl
3040   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3041   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3042   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3043   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3044   __ decrementl(rdx);              // Decrement counter
3045   __ jcc(Assembler::notZero, loop);
3046   __ pushptr(Address(rcx, 0));     // Save final return address
3047 
3048   // Re-push self-frame
3049   __ enter();                 // Save old & set new rbp
3050   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3051                               // Prolog
3052 
3053   // Use rbp because the frames look interpreted now
3054   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3055   // Don't need the precise return PC here, just precise enough to point into this code blob.
3056   address the_pc = __ pc();
3057   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3058 
3059   // Call C code.  Need thread but NOT official VM entry
3060   // crud.  We cannot block on this call, no GC can happen.  Call should
3061   // restore return values to their stack-slots with the new SP.
3062   // The thread is passed in c_rarg0 below.
3063   //
3064   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3065 
3066   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3067   __ mov(c_rarg0, r15_thread);
3068   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3069   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3070 
3071   // Set an oopmap for the call site
3072   // Use the same PC we used for the last java frame
3073   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3074 
3075   // Clear fp AND pc
3076   __ reset_last_Java_frame(true);
3077 
3078   // Pop self-frame.
3079   __ leave();                 // Epilog
3080 
3081   // Jump to interpreter
3082   __ ret(0);
3083 
3084   // Make sure all code is generated
3085   masm->flush();
3086 
3087   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3088                                                  SimpleRuntimeFrame::framesize >> 1);
3089 }
3090 #endif // COMPILER2
3091 
3092 //------------------------------generate_handler_blob------
3093 //
3094 // Generate a special Compile2Runtime blob that saves all registers,
3095 // and sets up the oopmap.
3096 //
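// Rough flow (a sketch): save all live registers and record the last Java frame,
// call into the VM at 'call_ptr', then either forward a pending exception to
// StubRoutines::forward_exception_entry() or, for a poll taken somewhere other
// than a return (cause_return == false), step the stashed return pc past the
// safepoint poll instruction before restoring registers and returning.
//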
3097 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3098   assert(StubRoutines::forward_exception_entry() != nullptr,
3099          "must be generated before");
3100 
3101   ResourceMark rm;
3102   OopMapSet *oop_maps = new OopMapSet();
3103   OopMap* map;
3104 
3105   // Allocate space for the code.  Setup code generation tools.
3106   CodeBuffer buffer("handler_blob", 2048, 1024);
3107   MacroAssembler* masm = new MacroAssembler(&buffer);
3108 
3109   address start   = __ pc();
3110   address call_pc = nullptr;
3111   int frame_size_in_words;
3112   bool cause_return = (poll_type == POLL_AT_RETURN);
3113   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3114 
3115   if (UseRTMLocking) {
3116     // Abort RTM transaction before calling runtime
3117     // because critical section will be large and will be
3118     // aborted anyway. Also nmethod could be deoptimized.
3119     __ xabort(0);
3120   }
3121 
3122   // Make room for return address (or push it again)
3123   if (!cause_return) {
3124     __ push(rbx);
3125   }
3126 
3127   // Save registers, fpu state, and flags
3128   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3129 
3130   // The following is basically a call_VM.  However, we need the precise
3131   // address of the call in order to generate an oopmap. Hence, we do all the
3132   // work ourselves.
3133 
3134   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3135 
3136   // The return address must always be correct so that frame constructor never
3137   // sees an invalid pc.
3138 
3139   if (!cause_return) {
3140     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3141     // Additionally, rbx is a callee-saved register and we can look at it later to determine
3142     // if someone changed the return address for us!
3143     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3144     __ movptr(Address(rbp, wordSize), rbx);
3145   }
3146 
3147   // Do the call
3148   __ mov(c_rarg0, r15_thread);
3149   __ call(RuntimeAddress(call_ptr));
3150 
3151   // Set an oopmap for the call site.  This oopmap will map all
3152   // oop-registers and debug-info registers as callee-saved.  This
3153   // will allow deoptimization at this safepoint to find all possible
3154   // debug-info recordings, as well as let GC find all oops.
3155 
3156   oop_maps->add_gc_map( __ pc() - start, map);
3157 
3158   Label noException;
3159 
3160   __ reset_last_Java_frame(false);
3161 
3162   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3163   __ jcc(Assembler::equal, noException);
3164 
3165   // Exception pending
3166 
3167   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3168 
3169   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3170 
3171   // No exception case
3172   __ bind(noException);
3173 
3174   Label no_adjust;
3175 #ifdef ASSERT
3176   Label bail;
3177 #endif
3178   if (!cause_return) {
3179     Label no_prefix, not_special;
3180 
3181     // If our stashed return pc was modified by the runtime we avoid touching it
3182     __ cmpptr(rbx, Address(rbp, wordSize));
3183     __ jccb(Assembler::notEqual, no_adjust);
3184 
3185     // Skip over the poll instruction.
3186     // See NativeInstruction::is_safepoint_poll()
3187     // Possible encodings:
3188     //      85 00       test   %eax,(%rax)
3189     //      85 01       test   %eax,(%rcx)
3190     //      85 02       test   %eax,(%rdx)
3191     //      85 03       test   %eax,(%rbx)
3192     //      85 06       test   %eax,(%rsi)
3193     //      85 07       test   %eax,(%rdi)
3194     //
3195     //   41 85 00       test   %eax,(%r8)
3196     //   41 85 01       test   %eax,(%r9)
3197     //   41 85 02       test   %eax,(%r10)
3198     //   41 85 03       test   %eax,(%r11)
3199     //   41 85 06       test   %eax,(%r14)
3200     //   41 85 07       test   %eax,(%r15)
3201     //
3202     //      85 04 24    test   %eax,(%rsp)
3203     //   41 85 04 24    test   %eax,(%r12)
3204     //      85 45 00    test   %eax,0x0(%rbp)
3205     //   41 85 45 00    test   %eax,0x0(%r13)
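    //
    // Worked example (a sketch): for "41 85 45 00" (test %eax,0x0(%r13)) the code
    // below advances rbx by 1 (REX prefix) + 1 (an rsp/rbp-class modrm carries an
    // extra SIB or disp8 byte) + 2 (opcode + modrm) = 4 bytes, the full instruction
    // length; for "85 00" (test %eax,(%rax)) only the final +2 applies.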
3206 
3207     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3208     __ jcc(Assembler::notEqual, no_prefix);
3209     __ addptr(rbx, 1);
3210     __ bind(no_prefix);
3211 #ifdef ASSERT
3212     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3213 #endif
3214     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3215     // r12/rsp 0x04
3216     // r13/rbp 0x05
3217     __ movzbq(rcx, Address(rbx, 1));
3218     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3219     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3220     __ cmpptr(rcx, 1);
3221     __ jcc(Assembler::above, not_special);
3222     __ addptr(rbx, 1);
3223     __ bind(not_special);
3224 #ifdef ASSERT
3225     // Verify the correct encoding of the poll we're about to skip.
3226     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3227     __ jcc(Assembler::notEqual, bail);
3228     // Mask out the modrm bits
3229     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3230     // rax encodes to 0, so if the bits are nonzero it's incorrect
3231     __ jcc(Assembler::notZero, bail);
3232 #endif
3233     // Adjust return pc forward to step over the safepoint poll instruction
3234     __ addptr(rbx, 2);
3235     __ movptr(Address(rbp, wordSize), rbx);
3236   }
3237 
3238   __ bind(no_adjust);
3239   // Normal exit, restore registers and exit.
3240   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3241   __ ret(0);
3242 
3243 #ifdef ASSERT
3244   __ bind(bail);
3245   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3246 #endif
3247 
3248   // Make sure all code is generated
3249   masm->flush();
3250 
3251   // Fill-out other meta info
3252   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3253 }
3254 
3255 //
3256 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3257 //
3258 // Generate a stub that calls into the VM to find out the proper destination
3259 // of a Java call. All the argument registers are live at this point,
3260 // but since this is generic code we don't know what they are, and the caller
3261 // must do any GC of the args.
3262 //
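// Rough flow (a sketch): save every register that might carry an argument, call
// 'destination' in the VM to resolve the call site, pick up the resolved Method*
// (vm_result_2) in rbx and the code entry point in rax, then restore the registers
// and jump to rax; if an exception is pending, restore and forward it instead.
//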
3263 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3264   assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3265 
3266   // allocate space for the code
3267   ResourceMark rm;
3268 
3269   CodeBuffer buffer(name, 1200, 512);
3270   MacroAssembler* masm = new MacroAssembler(&buffer);
3271 
3272   int frame_size_in_words;
3273 
3274   OopMapSet *oop_maps = new OopMapSet();
3275   OopMap* map = nullptr;
3276 
3277   int start = __ offset();
3278 
3279   // No need to save vector registers since they are caller-saved anyway.
3280   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3281 
3282   int frame_complete = __ offset();
3283 
3284   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3285 
3286   __ mov(c_rarg0, r15_thread);
3287 
3288   __ call(RuntimeAddress(destination));
3289 
3290 
3291   // Set an oopmap for the call site.
3292   // We need this not only for callee-saved registers, but also for volatile
3293   // registers that the compiler might be keeping live across a safepoint.
3294 
3295   oop_maps->add_gc_map( __ offset() - start, map);
3296 
3297   // rax contains the address we are going to jump to assuming no exception got installed
3298 
3299   // clear last_Java_sp
3300   __ reset_last_Java_frame(false);
3301   // check for pending exceptions
3302   Label pending;
3303   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3304   __ jcc(Assembler::notEqual, pending);
3305 
3306   // get the returned Method*
3307   __ get_vm_result_2(rbx, r15_thread);
3308   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3309 
3310   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3311 
3312   RegisterSaver::restore_live_registers(masm);
3313 
3314   // We are back to the original state on entry and ready to go.
3315 
3316   __ jmp(rax);
3317 
3318   // Pending exception after the safepoint
3319 
3320   __ bind(pending);
3321 
3322   RegisterSaver::restore_live_registers(masm);
3323 
3324   // exception pending => remove activation and forward to exception handler
3325 
3326   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3327 
3328   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3329   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3330 
3331   // -------------
3332   // make sure all code is generated
3333   masm->flush();
3334 
3335   // Return the blob.
3336   // The frame size passed to new_runtime_stub() is in words.
3337   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3338 }
3339 
3340 //------------------------------Montgomery multiplication------------------------
3341 //
3342 
3343 #ifndef _WINDOWS
3344 
3345 // Subtract 0:b from carry:a.  Return carry.
3346 static julong
3347 sub(julong a[], julong b[], julong carry, long len) {
3348   long i = 0, cnt = len;
3349   julong tmp;
3350   asm volatile("clc; "
3351                "0: ; "
3352                "mov (%[b], %[i], 8), %[tmp]; "
3353                "sbb %[tmp], (%[a], %[i], 8); "
3354                "inc %[i]; dec %[cnt]; "
3355                "jne 0b; "
3356                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3357                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3358                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3359                : "memory");
3360   return tmp;
3361 }
3362 
3363 // Multiply (unsigned) Long A by Long B, accumulating the double-
3364 // length result into the accumulator formed of T0, T1, and T2.
3365 #define MACC(A, B, T0, T1, T2)                                  \
3366 do {                                                            \
3367   unsigned long hi, lo;                                         \
3368   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3369            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3370            : "r"(A), "a"(B) : "cc");                            \
3371  } while(0)
3372 
3373 // As above, but add twice the double-length result into the
3374 // accumulator.
3375 #define MACC2(A, B, T0, T1, T2)                                 \
3376 do {                                                            \
3377   unsigned long hi, lo;                                         \
3378   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3379            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3380            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3381            : "r"(A), "a"(B) : "cc");                            \
3382  } while(0)
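// Semantics of the two macros above (a sketch, not additional code): treating
// T2:T1:T0 as a single 192-bit accumulator held in three 64-bit words,
//   MACC(A, B, T0, T1, T2)  performs  (T2:T1:T0) += A*B   (full 128-bit product)
//   MACC2(A, B, T0, T1, T2) performs  (T2:T1:T0) += 2*A*B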
3383 
3384 #else //_WINDOWS
3385 
3386 static julong
3387 sub(julong a[], julong b[], julong carry, long len) {
3388   long i;
3389   julong tmp;
3390   unsigned char c = 1;
3391   for (i = 0; i < len; i++) {
3392     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3393     a[i] = tmp;
3394   }
3395   c = _addcarry_u64(c, carry, ~0, &tmp);
3396   return tmp;
3397 }
3398 
3399 // Multiply (unsigned) Long A by Long B, accumulating the double-
3400 // length result into the accumulator formed of T0, T1, and T2.
3401 #define MACC(A, B, T0, T1, T2)                          \
3402 do {                                                    \
3403   julong hi, lo;                            \
3404   lo = _umul128(A, B, &hi);                             \
3405   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3406   c = _addcarry_u64(c, hi, T1, &T1);                    \
3407   _addcarry_u64(c, T2, 0, &T2);                         \
3408  } while(0)
3409 
3410 // As above, but add twice the double-length result into the
3411 // accumulator.
3412 #define MACC2(A, B, T0, T1, T2)                         \
3413 do {                                                    \
3414   julong hi, lo;                            \
3415   lo = _umul128(A, B, &hi);                             \
3416   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3417   c = _addcarry_u64(c, hi, T1, &T1);                    \
3418   _addcarry_u64(c, T2, 0, &T2);                         \
3419   c = _addcarry_u64(0, lo, T0, &T0);                    \
3420   c = _addcarry_u64(c, hi, T1, &T1);                    \
3421   _addcarry_u64(c, T2, 0, &T2);                         \
3422  } while(0)
3423 
3424 #endif //_WINDOWS
3425 
3426 // Fast Montgomery multiplication.  The derivation of the algorithm is
3427 // in  A Cryptographic Library for the Motorola DSP56000,
3428 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
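//
// Sketch of the reduction invariant (the assert below checks inv * n[0] == -1
// mod 2^64, i.e. inv == -n[0]^-1 mod 2^64): in each column the multiplier
// m[i] = t0 * inv is chosen so that t0 + m[i]*n[0] == 0 mod 2^64, letting the
// low accumulator word be shifted out.  After all 2*len columns, m holds
// (a*b + m*n) / R with R = 2^(64*len), which is congruent to a*b*R^-1 mod n;
// the trailing while (t0) loop subtracts n to absorb any carry out of the top.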
3429 
3430 static void NOINLINE
3431 montgomery_multiply(julong a[], julong b[], julong n[],
3432                     julong m[], julong inv, int len) {
3433   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3434   int i;
3435 
3436   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3437 
3438   for (i = 0; i < len; i++) {
3439     int j;
3440     for (j = 0; j < i; j++) {
3441       MACC(a[j], b[i-j], t0, t1, t2);
3442       MACC(m[j], n[i-j], t0, t1, t2);
3443     }
3444     MACC(a[i], b[0], t0, t1, t2);
3445     m[i] = t0 * inv;
3446     MACC(m[i], n[0], t0, t1, t2);
3447 
3448     assert(t0 == 0, "broken Montgomery multiply");
3449 
3450     t0 = t1; t1 = t2; t2 = 0;
3451   }
3452 
3453   for (i = len; i < 2*len; i++) {
3454     int j;
3455     for (j = i-len+1; j < len; j++) {
3456       MACC(a[j], b[i-j], t0, t1, t2);
3457       MACC(m[j], n[i-j], t0, t1, t2);
3458     }
3459     m[i-len] = t0;
3460     t0 = t1; t1 = t2; t2 = 0;
3461   }
3462 
3463   while (t0)
3464     t0 = sub(m, n, t0, len);
3465 }
3466 
3467 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3468 // multiplies so it should be up to 25% faster than Montgomery
3469 // multiplication.  However, its loop control is more complex and it
3470 // may actually run slower on some machines.
3471 
3472 static void NOINLINE
3473 montgomery_square(julong a[], julong n[],
3474                   julong m[], julong inv, int len) {
3475   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3476   int i;
3477 
3478   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3479 
3480   for (i = 0; i < len; i++) {
3481     int j;
3482     int end = (i+1)/2;
3483     for (j = 0; j < end; j++) {
3484       MACC2(a[j], a[i-j], t0, t1, t2);
3485       MACC(m[j], n[i-j], t0, t1, t2);
3486     }
3487     if ((i & 1) == 0) {
3488       MACC(a[j], a[j], t0, t1, t2);
3489     }
3490     for (; j < i; j++) {
3491       MACC(m[j], n[i-j], t0, t1, t2);
3492     }
3493     m[i] = t0 * inv;
3494     MACC(m[i], n[0], t0, t1, t2);
3495 
3496     assert(t0 == 0, "broken Montgomery square");
3497 
3498     t0 = t1; t1 = t2; t2 = 0;
3499   }
3500 
3501   for (i = len; i < 2*len; i++) {
3502     int start = i-len+1;
3503     int end = start + (len - start)/2;
3504     int j;
3505     for (j = start; j < end; j++) {
3506       MACC2(a[j], a[i-j], t0, t1, t2);
3507       MACC(m[j], n[i-j], t0, t1, t2);
3508     }
3509     if ((i & 1) == 0) {
3510       MACC(a[j], a[j], t0, t1, t2);
3511     }
3512     for (; j < len; j++) {
3513       MACC(m[j], n[i-j], t0, t1, t2);
3514     }
3515     m[i-len] = t0;
3516     t0 = t1; t1 = t2; t2 = 0;
3517   }
3518 
3519   while (t0)
3520     t0 = sub(m, n, t0, len);
3521 }
3522 
3523 // Swap words in a longword.
3524 static julong swap(julong x) {
3525   return (x << 32) | (x >> 32);
3526 }
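// For example, swap(0x1111111122222222ULL) == 0x2222222211111111ULL: the two
// 32-bit halves of the longword simply exchange places.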
3527 
3528 // Copy len longwords from s to d, word-swapping as we go.  The
3529 // destination array is reversed.
3530 static void reverse_words(julong *s, julong *d, int len) {
3531   d += len;
3532   while(len-- > 0) {
3533     d--;
3534     *d = swap(*s);
3535     s++;
3536   }
3537 }
3538 
3539 // The threshold at which squaring is advantageous was determined
3540 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3541 #define MONTGOMERY_SQUARING_THRESHOLD 64
3542 
3543 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3544                                         jint len, jlong inv,
3545                                         jint *m_ints) {
3546   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3547   int longwords = len/2;
3548 
3549   // Make very sure we don't use so much space that the stack might
3550   // overflow.  512 jints corresponds to a 16384-bit integer and
3551   // will use here a total of 8k bytes of stack space.
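  // (512 jints -> 256 julongs; 4 scratch arrays * 256 julongs * 8 bytes = 8192 bytes.)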
3552   int divisor = sizeof(julong) * 4;
3553   guarantee(longwords <= 8192 / divisor, "must be");
3554   int total_allocation = longwords * sizeof (julong) * 4;
3555   julong *scratch = (julong *)alloca(total_allocation);
3556 
3557   // Local scratch arrays
3558   julong
3559     *a = scratch + 0 * longwords,
3560     *b = scratch + 1 * longwords,
3561     *n = scratch + 2 * longwords,
3562     *m = scratch + 3 * longwords;
3563 
3564   reverse_words((julong *)a_ints, a, longwords);
3565   reverse_words((julong *)b_ints, b, longwords);
3566   reverse_words((julong *)n_ints, n, longwords);
3567 
3568   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3569 
3570   reverse_words(m, (julong *)m_ints, longwords);
3571 }
3572 
3573 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3574                                       jint len, jlong inv,
3575                                       jint *m_ints) {
3576   assert(len % 2 == 0, "array length in montgomery_square must be even");
3577   int longwords = len/2;
3578 
3579   // Make very sure we don't use so much space that the stack might
3580   // overflow.  512 jints corresponds to a 16384-bit integer and
3581   // will use here a total of 6k bytes of stack space.
3582   int divisor = sizeof(julong) * 3;
3583   guarantee(longwords <= (8192 / divisor), "must be");
3584   int total_allocation = longwords * sizeof (julong) * 3;
3585   julong *scratch = (julong *)alloca(total_allocation);
3586 
3587   // Local scratch arrays
3588   julong
3589     *a = scratch + 0 * longwords,
3590     *n = scratch + 1 * longwords,
3591     *m = scratch + 2 * longwords;
3592 
3593   reverse_words((julong *)a_ints, a, longwords);
3594   reverse_words((julong *)n_ints, n, longwords);
3595 
3596   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3597     ::montgomery_square(a, n, m, (julong)inv, longwords);
3598   } else {
3599     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3600   }
3601 
3602   reverse_words(m, (julong *)m_ints, longwords);
3603 }
3604 
3605 #ifdef COMPILER2
3606 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3607 //
3608 //------------------------------generate_exception_blob---------------------------
3609 // Creates the exception blob at the end.
3610 // Compiled methods jump to this code when an exception is thrown
3611 // (see emit_exception_handler in the x86_64.ad file).
3612 //
3613 // Given an exception pc at a call, we call into the runtime for the
3614 // handler in this method. This handler might merely restore state
3615 // (i.e. callee-saved registers), unwind the frame, and jump to the
3616 // exception handler for the nmethod if there is no Java-level handler
3617 // in the nmethod.
3618 //
3619 // This code is entered with a jmp.
3620 //
3621 // Arguments:
3622 //   rax: exception oop
3623 //   rdx: exception pc
3624 //
3625 // Results:
3626 //   rax: exception oop
3627 //   rdx: exception pc in caller or ???
3628 //   destination: exception handler of caller
3629 //
3630 // Note: the exception pc MUST be at a call (precise debug information)
3631 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee-saved.
3632 //
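// Rough flow (a sketch): stash the incoming exception oop and pc in the JavaThread,
// call OptoRuntime::handle_exception_C to locate the handler address, reload the
// oop and pc from the thread, and jump to the returned handler with rax and rdx
// set up as described above.
//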
3633 
3634 void OptoRuntime::generate_exception_blob() {
3635   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3636   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3637   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3638 
3639   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3640 
3641   // Allocate space for the code
3642   ResourceMark rm;
3643   // Setup code generation tools
3644   CodeBuffer buffer("exception_blob", 2048, 1024);
3645   MacroAssembler* masm = new MacroAssembler(&buffer);
3646 
3647 
3648   address start = __ pc();
3649 
3650   // Exception pc is 'return address' for stack walker
3651   __ push(rdx);
3652   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3653 
3654   // Save callee-saved registers.  See x86_64.ad.
3655 
3656   // rbp is an implicitly saved callee-saved register (i.e., the calling
3657   // convention will save/restore it in the prolog/epilog). Other than that
3658   // there are no callee-saved registers now that adapter frames are gone.
3659 
3660   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3661 
3662   // Store exception in Thread object. We cannot pass any arguments to the
3663   // handle_exception call, since we do not want to make any assumption
3664   // about the size of the frame in which the exception happened.
3665   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3666   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3667   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3668 
3669   // This call does all the hard work.  It checks if an exception handler
3670   // exists in the method.
3671   // If so, it returns the handler address.
3672   // If not, it prepares for stack-unwinding, restoring the callee-save
3673   // registers of the frame being removed.
3674   //
3675   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3676 
3677   // At a method handle call, the stack may not be properly aligned
3678   // when returning with an exception.
3679   address the_pc = __ pc();
3680   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3681   __ mov(c_rarg0, r15_thread);
3682   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3683   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3684 
3685   // Set an oopmap for the call site.  This oopmap will only be used if we
3686   // are unwinding the stack.  Hence, all locations will be dead.
3687   // Callee-saved registers will be the same as the frame above (i.e.,
3688   // handle_exception_stub), since they were restored when we got the
3689   // exception.
3690 
3691   OopMapSet* oop_maps = new OopMapSet();
3692 
3693   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3694 
3695   __ reset_last_Java_frame(false);
3696 
3697   // Restore callee-saved registers
3698 
3699   // rbp is an implicitly saved callee-saved register (i.e., the calling
3700   // convention will save/restore it in the prolog/epilog). Other than that
3701   // there are no callee-saved registers now that adapter frames are gone.
3702 
3703   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3704 
3705   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3706   __ pop(rdx);                  // No need for exception pc anymore
3707 
3708   // rax: exception handler
3709 
3710   // We have a handler in rax (could be deopt blob).
3711   __ mov(r8, rax);
3712 
3713   // Get the exception oop
3714   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3715   // Get the exception pc in case we are deoptimized
3716   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3717 #ifdef ASSERT
3718   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3719   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3720 #endif
3721   // Clear the exception oop so GC no longer processes it as a root.
3722   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3723 
3724   // rax: exception oop
3725   // r8:  exception handler
3726   // rdx: exception pc
3727   // Jump to handler
3728 
3729   __ jmp(r8);
3730 
3731   // Make sure all code is generated
3732   masm->flush();
3733 
3734   // Set exception blob
3735   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3736 }
3737 #endif // COMPILER2