1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/vframeArray.hpp"
  56 #include "runtime/vm_version.hpp"
  57 #include "utilities/align.hpp"
  58 #include "utilities/checkedCast.hpp"
  59 #include "utilities/formatBuffer.hpp"
  60 #include "vmreg_x86.inline.hpp"
  61 #ifdef COMPILER1
  62 #include "c1/c1_Runtime1.hpp"
  63 #endif
  64 #ifdef COMPILER2
  65 #include "opto/runtime.hpp"
  66 #endif
  67 #if INCLUDE_JVMCI
  68 #include "jvmci/jvmciJavaClasses.hpp"
  69 #endif
  70 
  71 #define __ masm->
  72 
  73 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  74 
  75 class SimpleRuntimeFrame {
  76 
  77   public:
  78 
  79   // Most of the runtime stubs have this simple frame layout.
  80   // This class exists to make the layout shared in one place.
  81   // Offsets are for compiler stack slots, which are jints.
  82   enum layout {
  83     // The frame sender code expects that rbp will be in the "natural" place and
  84     // will override any oopMap setting for it. We must therefore force the layout
  85     // so that it agrees with the frame sender code.
  86     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  87     rbp_off2,
  88     return_off, return_off2,
  89     framesize
  90   };
  91 };
  92 
  93 class RegisterSaver {
  94   // Capture info about frame layout.  Layout offsets are in jint
  95   // units because compiler frame slots are jints.
  96 #define XSAVE_AREA_BEGIN 160
  97 #define XSAVE_AREA_YMM_BEGIN 576
  98 #define XSAVE_AREA_OPMASK_BEGIN 1088
  99 #define XSAVE_AREA_ZMM_BEGIN 1152
 100 #define XSAVE_AREA_UPPERBANK 1664
 101 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 102 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 103 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 104 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 105 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
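  // For example, DEF_XMM_OFFS(0) expands to
  //   xmm0_off = xmm_off + 0*16/BytesPerInt, xmm0H_off
  // so each register contributes a low-half slot name (<reg>_off) and a
  // high-half slot name (<reg>H_off) to the layout enum below.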
 106   enum layout {
 107     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 108     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 109     DEF_XMM_OFFS(0),
 110     DEF_XMM_OFFS(1),
 111     // 2..15 are implied in range usage
 112     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 113     DEF_YMM_OFFS(0),
 114     DEF_YMM_OFFS(1),
 115     // 2..15 are implied in range usage
 116     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 117     DEF_OPMASK_OFFS(0),
 118     DEF_OPMASK_OFFS(1),
 119     // 2..7 are implied in range usage
 120     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 121     DEF_ZMM_OFFS(0),
 122     DEF_ZMM_OFFS(1),
 123     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 124     DEF_ZMM_UPPER_OFFS(16),
 125     DEF_ZMM_UPPER_OFFS(17),
 126     // 18..31 are implied in range usage
 127     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 128     fpu_stateH_end,
 129     r15_off, r15H_off,
 130     r14_off, r14H_off,
 131     r13_off, r13H_off,
 132     r12_off, r12H_off,
 133     r11_off, r11H_off,
 134     r10_off, r10H_off,
 135     r9_off,  r9H_off,
 136     r8_off,  r8H_off,
 137     rdi_off, rdiH_off,
 138     rsi_off, rsiH_off,
 139     ignore_off, ignoreH_off,  // extra copy of rbp
 140     rsp_off, rspH_off,
 141     rbx_off, rbxH_off,
 142     rdx_off, rdxH_off,
 143     rcx_off, rcxH_off,
 144     rax_off, raxH_off,
 145     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 146     align_off, alignH_off,
 147     flags_off, flagsH_off,
 148     // The frame sender code expects that rbp will be in the "natural" place and
 149     // will override any oopMap setting for it. We must therefore force the layout
 150     // so that it agrees with the frame sender code.
 151     rbp_off, rbpH_off,        // copy of rbp we will restore
 152     return_off, returnH_off,  // slot for return address
 153     reg_save_size             // size in compiler stack slots
 154   };
 155 
 156  public:
 157   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 158   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 159 
 160   // Offsets into the register save area
 161   // Used by deoptimization when it is managing result register
 162   // values on its own
 163 
 164   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 165   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 166   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 167   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 168   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 169   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 170 
 171   // During deoptimization only the result registers need to be restored,
 172   // all the other values have already been extracted.
 173   static void restore_result_registers(MacroAssembler* masm);
 174 };
 175 
 176 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 177   int off = 0;
 178   int num_xmm_regs = XMMRegister::available_xmm_registers();
 179 #if COMPILER2_OR_JVMCI
 180   if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 182   }
 183   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 184 #else
 185   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 186 #endif
 187 
  // Always make the frame size 16-byte aligned; both vector and non-vector stack frames are allocated the same way.
 189   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 190   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 191   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 192   // CodeBlob frame size is in words.
 193   int frame_size_in_words = frame_size_in_bytes / wordSize;
 194   *total_frame_words = frame_size_in_words;
 195 
 196   // Save registers, fpu state, and flags.
 197   // We assume caller has already pushed the return address onto the
 198   // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, just like a normal enter() would leave it.
 201 
 202   __ enter();          // rsp becomes 16-byte aligned here
 203   __ push_CPU_state(); // Push a multiple of 16 bytes
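  // The frame now matches the 'layout' enum above (once the argument register
  // save area is allocated below): the return address and saved rbp sit at the
  // high end, the flags and integer registers pushed by push_CPU_state come
  // next, and the FPU/XSAVE save area fills the low end starting at
  // fpu_state_off.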
 204 
  // push_CPU_state handles this on EVEX enabled targets
 206   if (save_wide_vectors) {
 207     // Save upper half of YMM registers(0..15)
 208     int base_addr = XSAVE_AREA_YMM_BEGIN;
 209     for (int n = 0; n < 16; n++) {
 210       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 211     }
 212     if (VM_Version::supports_evex()) {
 213       // Save upper half of ZMM registers(0..15)
 214       base_addr = XSAVE_AREA_ZMM_BEGIN;
 215       for (int n = 0; n < 16; n++) {
 216         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 217       }
 218       // Save full ZMM registers(16..num_xmm_regs)
 219       base_addr = XSAVE_AREA_UPPERBANK;
 220       off = 0;
 221       int vector_len = Assembler::AVX_512bit;
 222       for (int n = 16; n < num_xmm_regs; n++) {
 223         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 224       }
 225 #if COMPILER2_OR_JVMCI
 226       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 227       off = 0;
 228       for(int n = 0; n < KRegister::number_of_registers; n++) {
 229         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 230       }
 231 #endif
 232     }
 233   } else {
 234     if (VM_Version::supports_evex()) {
 235       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 236       int base_addr = XSAVE_AREA_UPPERBANK;
 237       off = 0;
 238       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 239       for (int n = 16; n < num_xmm_regs; n++) {
 240         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 241       }
 242 #if COMPILER2_OR_JVMCI
 243       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 244       off = 0;
 245       for(int n = 0; n < KRegister::number_of_registers; n++) {
 246         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 247       }
 248 #endif
 249     }
 250   }
 251   __ vzeroupper();
 252   if (frame::arg_reg_save_area_bytes != 0) {
 253     // Allocate argument register save area
 254     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 255   }
 256 
 257   // Set an oopmap for the call site.  This oopmap will map all
 258   // oop-registers and debug-info registers as callee-saved.  This
 259   // will allow deoptimization at this safepoint to find all possible
 260   // debug-info recordings, as well as let GC find all oops.
 261 
 262   OopMapSet *oop_maps = new OopMapSet();
 263   OopMap* map = new OopMap(frame_size_in_slots, 0);
 264 
 265 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 266 
 267   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 271   // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
 273   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets it is included in the XSAVE area.
 285   off = xmm0_off;
 286   int delta = xmm1_off - off;
 287   for (int n = 0; n < 16; n++) {
 288     XMMRegister xmm_name = as_XMMRegister(n);
 289     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 290     off += delta;
 291   }
 292   if (UseAVX > 2) {
 293     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 294     off = zmm16_off;
 295     delta = zmm17_off - off;
 296     for (int n = 16; n < num_xmm_regs; n++) {
 297       XMMRegister zmm_name = as_XMMRegister(n);
 298       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 299       off += delta;
 300     }
 301   }
 302 
 303 #if COMPILER2_OR_JVMCI
 304   if (save_wide_vectors) {
 305     // Save upper half of YMM registers(0..15)
 306     off = ymm0_off;
 307     delta = ymm1_off - ymm0_off;
 308     for (int n = 0; n < 16; n++) {
 309       XMMRegister ymm_name = as_XMMRegister(n);
 310       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 311       off += delta;
 312     }
 313     if (VM_Version::supports_evex()) {
 314       // Save upper half of ZMM registers(0..15)
 315       off = zmm0_off;
 316       delta = zmm1_off - zmm0_off;
 317       for (int n = 0; n < 16; n++) {
 318         XMMRegister zmm_name = as_XMMRegister(n);
 319         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 320         off += delta;
 321       }
 322     }
 323   }
 324 #endif // COMPILER2_OR_JVMCI
 325 
 326   // %%% These should all be a waste but we'll keep things as they were for now
 327   if (true) {
 328     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 332     // rbp location is known implicitly by the frame sender code, needs no oopmap
 333     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets it is included in the XSAVE area.
 345     off = xmm0H_off;
 346     delta = xmm1H_off - off;
 347     for (int n = 0; n < 16; n++) {
 348       XMMRegister xmm_name = as_XMMRegister(n);
 349       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 350       off += delta;
 351     }
 352     if (UseAVX > 2) {
 353       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 354       off = zmm16H_off;
 355       delta = zmm17H_off - off;
 356       for (int n = 16; n < num_xmm_regs; n++) {
 357         XMMRegister zmm_name = as_XMMRegister(n);
 358         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 359         off += delta;
 360       }
 361     }
 362   }
 363 
 364   return map;
 365 }
 366 
 367 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 368   int num_xmm_regs = XMMRegister::available_xmm_registers();
 369   if (frame::arg_reg_save_area_bytes != 0) {
 370     // Pop arg register save area
 371     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 372   }
 373 
 374 #if COMPILER2_OR_JVMCI
 375   if (restore_wide_vectors) {
 376     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 377     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 378   }
 379 #else
 380   assert(!restore_wide_vectors, "vectors are generated only by C2");
 381 #endif
 382 
 383   __ vzeroupper();
 384 
  // On EVEX enabled targets everything is handled by the FPU state restore in pop_CPU_state
 386   if (restore_wide_vectors) {
 387     // Restore upper half of YMM registers (0..15)
 388     int base_addr = XSAVE_AREA_YMM_BEGIN;
 389     for (int n = 0; n < 16; n++) {
 390       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 391     }
 392     if (VM_Version::supports_evex()) {
 393       // Restore upper half of ZMM registers (0..15)
 394       base_addr = XSAVE_AREA_ZMM_BEGIN;
 395       for (int n = 0; n < 16; n++) {
 396         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 397       }
 398       // Restore full ZMM registers(16..num_xmm_regs)
 399       base_addr = XSAVE_AREA_UPPERBANK;
 400       int vector_len = Assembler::AVX_512bit;
 401       int off = 0;
 402       for (int n = 16; n < num_xmm_regs; n++) {
 403         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 404       }
 405 #if COMPILER2_OR_JVMCI
 406       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 407       off = 0;
 408       for (int n = 0; n < KRegister::number_of_registers; n++) {
 409         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 410       }
 411 #endif
 412     }
 413   } else {
 414     if (VM_Version::supports_evex()) {
 415       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 416       int base_addr = XSAVE_AREA_UPPERBANK;
 417       int off = 0;
 418       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 419       for (int n = 16; n < num_xmm_regs; n++) {
 420         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 421       }
 422 #if COMPILER2_OR_JVMCI
 423       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 424       off = 0;
 425       for (int n = 0; n < KRegister::number_of_registers; n++) {
 426         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 427       }
 428 #endif
 429     }
 430   }
 431 
 432   // Recover CPU state
 433   __ pop_CPU_state();
 434   // Get the rbp described implicitly by the calling convention (no oopMap)
 435   __ pop(rbp);
 436 }
 437 
 438 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 439 
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee-save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.
 445 
 446   // Restore fp result register
 447   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 448   // Restore integer result register
 449   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 450   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 451 
  // Pop all of the register save area off the stack except the return address
 453   __ addptr(rsp, return_offset_in_bytes());
 454 }
 455 
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using the fxsave/fxrstor instructions.
 458 bool SharedRuntime::is_wide_vector(int size) {
 459   return size > 16;
 460 }
 461 
 462 // ---------------------------------------------------------------------------
 463 // Read the array of BasicTypes from a signature, and compute where the
 464 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 465 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 466 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 467 // as framesizes are fixed.
 468 // VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
 471 // integer registers.
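// For example, an argument assigned stack slot 3 lives at
// 3 * VMRegImpl::stack_slot_size == 12 bytes above the stack pointer.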
 472 
 473 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 474 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.
 476 
 477 // The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static JNI methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
 481 // advantage out of it.
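//
// For illustration, a signature of (int, long, Object, double) is laid out by
// the code below as:
//   int    -> j_rarg0                  (single 32-bit slot of the VMRegPair)
//   long   -> j_rarg1, plus a T_VOID half
//   Object -> j_rarg2
//   double -> j_farg0, plus a T_VOID half
// Only when the six j_rargs (or eight j_fargs) are exhausted do arguments
// spill to 8-byte-aligned stack slots counted in stk_args.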
 482 
 483 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 484                                            VMRegPair *regs,
 485                                            int total_args_passed) {
 486 
 487   // Create the mapping between argument positions and
 488   // registers.
 489   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 490     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 491   };
 492   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 493     j_farg0, j_farg1, j_farg2, j_farg3,
 494     j_farg4, j_farg5, j_farg6, j_farg7
 495   };
 496 
 497 
 498   uint int_args = 0;
 499   uint fp_args = 0;
 500   uint stk_args = 0;
 501 
 502   for (int i = 0; i < total_args_passed; i++) {
 503     switch (sig_bt[i]) {
 504     case T_BOOLEAN:
 505     case T_CHAR:
 506     case T_BYTE:
 507     case T_SHORT:
 508     case T_INT:
 509       if (int_args < Argument::n_int_register_parameters_j) {
 510         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 511       } else {
 512         stk_args = align_up(stk_args, 2);
 513         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 514         stk_args += 1;
 515       }
 516       break;
 517     case T_VOID:
 518       // halves of T_LONG or T_DOUBLE
 519       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 520       regs[i].set_bad();
 521       break;
 522     case T_LONG:
 523       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 524       // fall through
 525     case T_OBJECT:
 526     case T_ARRAY:
 527     case T_ADDRESS:
 528       if (int_args < Argument::n_int_register_parameters_j) {
 529         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 530       } else {
 531         stk_args = align_up(stk_args, 2);
 532         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 533         stk_args += 2;
 534       }
 535       break;
 536     case T_FLOAT:
 537       if (fp_args < Argument::n_float_register_parameters_j) {
 538         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 539       } else {
 540         stk_args = align_up(stk_args, 2);
 541         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 542         stk_args += 1;
 543       }
 544       break;
 545     case T_DOUBLE:
 546       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 547       if (fp_args < Argument::n_float_register_parameters_j) {
 548         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 549       } else {
 550         stk_args = align_up(stk_args, 2);
 551         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 552         stk_args += 2;
 553       }
 554       break;
 555     default:
 556       ShouldNotReachHere();
 557       break;
 558     }
 559   }
 560 
 561   return stk_args;
 562 }
 563 
// Patch the caller's callsite with the entry to compiled code if it exists.
 565 static void patch_callers_callsite(MacroAssembler *masm) {
 566   Label L;
 567   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 568   __ jcc(Assembler::equal, L);
 569 
 570   // Save the current stack pointer
 571   __ mov(r13, rsp);
 572   // Schedule the branch target address early.
 573   // Call into the VM to patch the caller, then jump to compiled callee
 574   // rax isn't live so capture return address while we easily can
 575   __ movptr(rax, Address(rsp, 0));
 576 
 577   // align stack so push_CPU_state doesn't fault
 578   __ andptr(rsp, -(StackAlignmentInBytes));
 579   __ push_CPU_state();
 580   __ vzeroupper();
 581   // VM needs caller's callsite
 582   // VM needs target method
 583   // This needs to be a long call since we will relocate this adapter to
 584   // the codeBuffer and it may not reach
 585 
 586   // Allocate argument register save area
 587   if (frame::arg_reg_save_area_bytes != 0) {
 588     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 589   }
 590   __ mov(c_rarg0, rbx);
 591   __ mov(c_rarg1, rax);
 592   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 593 
 594   // De-allocate argument register save area
 595   if (frame::arg_reg_save_area_bytes != 0) {
 596     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 597   }
 598 
 599   __ vzeroupper();
 600   __ pop_CPU_state();
 601   // restore sp
 602   __ mov(rsp, r13);
 603   __ bind(L);
 604 }
 605 
 606 
 607 static void gen_c2i_adapter(MacroAssembler *masm,
 608                             int total_args_passed,
 609                             int comp_args_on_stack,
 610                             const BasicType *sig_bt,
 611                             const VMRegPair *regs,
 612                             Label& skip_fixup) {
 613   // Before we get into the guts of the C2I adapter, see if we should be here
 614   // at all.  We've come from compiled code and are attempting to jump to the
 615   // interpreter, which means the caller made a static call to get here
 616   // (vcalls always get a compiled target if there is one).  Check for a
 617   // compiled target.  If there is one, we need to patch the caller's call.
 618   patch_callers_callsite(masm);
 619 
 620   __ bind(skip_fixup);
 621 
 622   // Since all args are passed on the stack, total_args_passed *
 623   // Interpreter::stackElementSize is the space we need.
 624 
 625   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 626 
 627   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 628 
 629   // stack is aligned, keep it that way
 630   // This is not currently needed or enforced by the interpreter, but
 631   // we might as well conform to the ABI.
 632   extraspace = align_up(extraspace, 2*wordSize);
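  // For example, with 5 arguments and an 8-byte stackElementSize this is
  // 5 * 8 = 40 bytes, rounded up to 48 to preserve 16-byte alignment.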
 633 
 634   // set senderSP value
 635   __ lea(r13, Address(rsp, wordSize));
 636 
 637 #ifdef ASSERT
 638   __ check_stack_alignment(r13, "sender stack not aligned");
 639 #endif
 640   if (extraspace > 0) {
 641     // Pop the return address
 642     __ pop(rax);
 643 
 644     __ subptr(rsp, extraspace);
 645 
 646     // Push the return address
 647     __ push(rax);
 648 
 649     // Account for the return address location since we store it first rather
 650     // than hold it in a register across all the shuffling
 651     extraspace += wordSize;
 652   }
 653 
 654 #ifdef ASSERT
 655   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 656 #endif
 657 
 658   // Now write the args into the outgoing interpreter space
 659   for (int i = 0; i < total_args_passed; i++) {
 660     if (sig_bt[i] == T_VOID) {
 661       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 662       continue;
 663     }
 664 
 665     // offset to start parameters
 666     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 667     int next_off = st_off - Interpreter::stackElementSize;
 668 
 669     // Say 4 args:
 670     // i   st_off
 671     // 0   32 T_LONG
 672     // 1   24 T_VOID
 673     // 2   16 T_OBJECT
 674     // 3    8 T_BOOL
 675     // -    0 return address
 676     //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM, and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
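    //
    // In the example above, the T_LONG at i == 0 is therefore stored at
    // next_off == 24 (the T_VOID slot), while the slot at st_off == 32 is
    // left unused (it is filled with known junk in debug builds).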
 681 
 682     VMReg r_1 = regs[i].first();
 683     VMReg r_2 = regs[i].second();
 684     if (!r_1->is_valid()) {
 685       assert(!r_2->is_valid(), "");
 686       continue;
 687     }
 688     if (r_1->is_stack()) {
 689       // memory to memory use rax
 690       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 691       if (!r_2->is_valid()) {
 692         // sign extend??
 693         __ movl(rax, Address(rsp, ld_off));
 694         __ movptr(Address(rsp, st_off), rax);
 695 
 696       } else {
 697 
 698         __ movq(rax, Address(rsp, ld_off));
 699 
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 701         // T_DOUBLE and T_LONG use two slots in the interpreter
 702         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 703           // ld_off == LSW, ld_off+wordSize == MSW
 704           // st_off == MSW, next_off == LSW
 705           __ movq(Address(rsp, next_off), rax);
 706 #ifdef ASSERT
 707           // Overwrite the unused slot with known junk
 708           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 709           __ movptr(Address(rsp, st_off), rax);
 710 #endif /* ASSERT */
 711         } else {
 712           __ movq(Address(rsp, st_off), rax);
 713         }
 714       }
 715     } else if (r_1->is_Register()) {
 716       Register r = r_1->as_Register();
 717       if (!r_2->is_valid()) {
        // must be only an int (or smaller), so move only 32 bits to the slot
 719         // why not sign extend??
 720         __ movl(Address(rsp, st_off), r);
 721       } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 723         // T_DOUBLE and T_LONG use two slots in the interpreter
 724         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 725           // long/double in gpr
 726 #ifdef ASSERT
 727           // Overwrite the unused slot with known junk
 728           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 729           __ movptr(Address(rsp, st_off), rax);
 730 #endif /* ASSERT */
 731           __ movq(Address(rsp, next_off), r);
 732         } else {
 733           __ movptr(Address(rsp, st_off), r);
 734         }
 735       }
 736     } else {
 737       assert(r_1->is_XMMRegister(), "");
 738       if (!r_2->is_valid()) {
        // only a float; use just part of the slot
 740         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 741       } else {
 742 #ifdef ASSERT
 743         // Overwrite the unused slot with known junk
 744         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 745         __ movptr(Address(rsp, st_off), rax);
 746 #endif /* ASSERT */
 747         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 748       }
 749     }
 750   }
 751 
 752   // Schedule the branch target address early.
 753   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 754   __ jmp(rcx);
 755 }
 756 
 757 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 758                         address code_start, address code_end,
 759                         Label& L_ok) {
 760   Label L_fail;
 761   __ lea(temp_reg, ExternalAddress(code_start));
 762   __ cmpptr(pc_reg, temp_reg);
 763   __ jcc(Assembler::belowEqual, L_fail);
 764   __ lea(temp_reg, ExternalAddress(code_end));
 765   __ cmpptr(pc_reg, temp_reg);
 766   __ jcc(Assembler::below, L_ok);
 767   __ bind(L_fail);
 768 }
 769 
 770 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 771                                     int total_args_passed,
 772                                     int comp_args_on_stack,
 773                                     const BasicType *sig_bt,
 774                                     const VMRegPair *regs) {
 775 
 776   // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get the args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry, else we lose the
  // alignment we expect in all compiled code, and the register save code
  // can segv when the fxsave instructions find an improperly aligned stack
  // pointer.
 784 
 785   // Adapters can be frameless because they do not require the caller
 786   // to perform additional cleanup work, such as correcting the stack pointer.
 787   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 788   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 789   // even if a callee has modified the stack pointer.
 790   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 791   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 792   // up via the senderSP register).
 793   // In other words, if *either* the caller or callee is interpreted, we can
 794   // get the stack pointer repaired after a call.
 795   // This is why c2i and i2c adapters cannot be indefinitely composed.
 796   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 797   // both caller and callee would be compiled methods, and neither would
 798   // clean up the stack pointer changes performed by the two adapters.
 799   // If this happens, control eventually transfers back to the compiled
 800   // caller, but with an uncorrected stack, causing delayed havoc.
 801 
 802   if (VerifyAdapterCalls &&
 803       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 804     // So, let's test for cascading c2i/i2c adapters right now.
 805     //  assert(Interpreter::contains($return_addr) ||
 806     //         StubRoutines::contains($return_addr),
 807     //         "i2c adapter must return to an interpreter frame");
 808     __ block_comment("verify_i2c { ");
 809     // Pick up the return address
 810     __ movptr(rax, Address(rsp, 0));
 811     Label L_ok;
 812     if (Interpreter::code() != nullptr) {
 813       range_check(masm, rax, r11,
 814                   Interpreter::code()->code_start(),
 815                   Interpreter::code()->code_end(),
 816                   L_ok);
 817     }
 818     if (StubRoutines::initial_stubs_code() != nullptr) {
 819       range_check(masm, rax, r11,
 820                   StubRoutines::initial_stubs_code()->code_begin(),
 821                   StubRoutines::initial_stubs_code()->code_end(),
 822                   L_ok);
 823     }
 824     if (StubRoutines::final_stubs_code() != nullptr) {
 825       range_check(masm, rax, r11,
 826                   StubRoutines::final_stubs_code()->code_begin(),
 827                   StubRoutines::final_stubs_code()->code_end(),
 828                   L_ok);
 829     }
 830     const char* msg = "i2c adapter must return to an interpreter frame";
 831     __ block_comment(msg);
 832     __ stop(msg);
 833     __ bind(L_ok);
    __ block_comment("} verify_i2c ");
 835   }
 836 
 837   // Must preserve original SP for loading incoming arguments because
 838   // we need to align the outgoing SP for compiled code.
 839   __ movptr(r11, rsp);
 840 
 841   // Pick up the return address
 842   __ pop(rax);
 843 
 844   // Convert 4-byte c2 stack slots to words.
 845   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
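  // For example, 5 compiler arg slots are 5 * 4 = 20 bytes, rounded up to
  // 24 bytes, i.e. 3 words of outgoing space.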
 846 
 847   if (comp_args_on_stack) {
 848     __ subptr(rsp, comp_words_on_stack * wordSize);
 849   }
 850 
 851   // Ensure compiled code always sees stack at proper alignment
 852   __ andptr(rsp, -16);
 853 
  // Push the return address and misalign the stack so that the youngest frame
  // always sees the stack just as it would right after a call instruction.
 856   __ push(rax);
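  // After this push rsp is 8 modulo 16, which is exactly what compiled code
  // expects to see immediately after a call instruction.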
 857 
 858   // Put saved SP in another register
 859   const Register saved_sp = rax;
 860   __ movptr(saved_sp, r11);
 861 
 862   // Will jump to the compiled code just as if compiled code was doing it.
 863   // Pre-load the register-jump target early, to schedule it better.
 864   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 865 
 866 #if INCLUDE_JVMCI
 867   if (EnableJVMCI) {
 868     // check if this call should be routed towards a specific entry point
 869     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 870     Label no_alternative_target;
 871     __ jcc(Assembler::equal, no_alternative_target);
 872     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 873     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 874     __ bind(no_alternative_target);
 875   }
 876 #endif // INCLUDE_JVMCI
 877 
 878   // Now generate the shuffle code.  Pick up all register args and move the
 879   // rest through the floating point stack top.
 880   for (int i = 0; i < total_args_passed; i++) {
 881     if (sig_bt[i] == T_VOID) {
 882       // Longs and doubles are passed in native word order, but misaligned
 883       // in the 32-bit build.
 884       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 885       continue;
 886     }
 887 
 888     // Pick up 0, 1 or 2 words from SP+offset.
 889 
 890     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 891             "scrambled load targets?");
 892     // Load in argument order going down.
 893     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 894     // Point to interpreter value (vs. tag)
 895     int next_off = ld_off - Interpreter::stackElementSize;
 899     VMReg r_1 = regs[i].first();
 900     VMReg r_2 = regs[i].second();
 901     if (!r_1->is_valid()) {
 902       assert(!r_2->is_valid(), "");
 903       continue;
 904     }
 905     if (r_1->is_stack()) {
 906       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 907       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 908 
 909       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going through a c2i because of a miss, a reasonable value
      // of r13 will be generated.
 912       if (!r_2->is_valid()) {
 913         // sign extend???
 914         __ movl(r13, Address(saved_sp, ld_off));
 915         __ movptr(Address(rsp, st_off), r13);
 916       } else {
 917         //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
 921         //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed at negative offsets, so the LSW is at the LOW address.
 924 
 925         // ld_off is MSW so get LSW
 926         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 927                            next_off : ld_off;
 928         __ movq(r13, Address(saved_sp, offset));
 929         // st_off is LSW (i.e. reg.first())
 930         __ movq(Address(rsp, st_off), r13);
 931       }
 932     } else if (r_1->is_Register()) {  // Register argument
 933       Register r = r_1->as_Register();
 934       assert(r != rax, "must be different");
 935       if (r_2->is_valid()) {
 936         //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
 940 
 941         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 942                            next_off : ld_off;
 943 
 944         // this can be a misaligned move
 945         __ movq(r, Address(saved_sp, offset));
 946       } else {
 947         // sign extend and use a full word?
 948         __ movl(r, Address(saved_sp, ld_off));
 949       }
 950     } else {
 951       if (!r_2->is_valid()) {
 952         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 953       } else {
 954         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 955       }
 956     }
 957   }
 958 
 959   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 960 
 961   // 6243940 We might end up in handle_wrong_method if
 962   // the callee is deoptimized as we race thru here. If that
 963   // happens we don't want to take a safepoint because the
 964   // caller frame will look interpreted and arguments are now
 965   // "compiled" so it is much better to make this transition
 966   // invisible to the stack walking code. Unfortunately if
 967   // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
 970 
 971   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 972 
  // Put the Method* where a c2i would expect it, should we end up there.
  // Only needed because c2 resolve stubs return the Method* as a result in
  // rax.
 976   __ mov(rax, rbx);
 977   __ jmp(r11);
 978 }
 979 
 980 // ---------------------------------------------------------------
 981 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 982                                                             int total_args_passed,
 983                                                             int comp_args_on_stack,
 984                                                             const BasicType *sig_bt,
 985                                                             const VMRegPair *regs,
 986                                                             AdapterFingerPrint* fingerprint) {
 987   address i2c_entry = __ pc();
 988 
 989   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 990 
 991   // -------------------------------------------------------------------------
 992   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 993   // to the interpreter.  The args start out packed in the compiled layout.  They
 994   // need to be unpacked into the interpreter layout.  This will almost always
 995   // require some stack space.  We grow the current (compiled) stack, then repack
 996   // the args.  We  finally end in a jump to the generic interpreter entry point.
 997   // On exit from the interpreter, the interpreter will restore our SP (lest the
 998   // compiled code, which relies solely on SP and not RBP, get sick).
 999 
1000   address c2i_unverified_entry = __ pc();
1001   Label skip_fixup;
1002 
1003   Register data = rax;
1004   Register receiver = j_rarg0;
1005   Register temp = rbx;
1006 
1007   {
1008     __ ic_check(1 /* end_alignment */);
1009     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1010     // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
    // the call site corrected.
1013     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1014     __ jcc(Assembler::equal, skip_fixup);
1015     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1016   }
1017 
1018   address c2i_entry = __ pc();
1019 
1020   // Class initialization barrier for static methods
1021   address c2i_no_clinit_check_entry = nullptr;
1022   if (VM_Version::supports_fast_class_init_checks()) {
1023     Label L_skip_barrier;
1024     Register method = rbx;
1025 
1026     { // Bypass the barrier for non-static methods
1027       Register flags = rscratch1;
1028       __ movl(flags, Address(method, Method::access_flags_offset()));
1029       __ testl(flags, JVM_ACC_STATIC);
1030       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1031     }
1032 
1033     Register klass = rscratch1;
1034     __ load_method_holder(klass, method);
1035     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1036 
1037     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1038 
1039     __ bind(L_skip_barrier);
1040     c2i_no_clinit_check_entry = __ pc();
1041   }
1042 
1043   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1044   bs->c2i_entry_barrier(masm);
1045 
1046   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1047 
1048   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1049 }
1050 
1051 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1052                                          VMRegPair *regs,
1053                                          int total_args_passed) {
1054 
// We return the number of VMRegImpl stack slots we need to reserve for all
1056 // the arguments NOT counting out_preserve_stack_slots.
1057 
1058 // NOTE: These arrays will have to change when c1 is ported
1059 #ifdef _WIN64
1060     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1061       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1062     };
1063     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1064       c_farg0, c_farg1, c_farg2, c_farg3
1065     };
1066 #else
1067     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1068       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1069     };
1070     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1071       c_farg0, c_farg1, c_farg2, c_farg3,
1072       c_farg4, c_farg5, c_farg6, c_farg7
1073     };
1074 #endif // _WIN64
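    // For illustration, a native signature of (jint, jdouble, jobject) is
    // assigned by the loop below as follows:
    //   - on _WIN64:  jint -> c_rarg0, jdouble -> c_farg1, jobject -> c_rarg2
    //     (register positions are shared, so assigning one kind of register
    //     also consumes the corresponding slot of the other kind)
    //   - otherwise:  jint -> c_rarg0, jdouble -> c_farg0, jobject -> c_rarg1
    //     (integer and FP registers are counted independently)
    // In both cases stk_args counts the 4-byte slots needed once registers run
    // out (and, on _WIN64, the home space reserved for register arguments).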
1075 
1076 
1077     uint int_args = 0;
1078     uint fp_args = 0;
1079     uint stk_args = 0; // inc by 2 each time
1080 
1081     for (int i = 0; i < total_args_passed; i++) {
1082       switch (sig_bt[i]) {
1083       case T_BOOLEAN:
1084       case T_CHAR:
1085       case T_BYTE:
1086       case T_SHORT:
1087       case T_INT:
1088         if (int_args < Argument::n_int_register_parameters_c) {
1089           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1090 #ifdef _WIN64
1091           fp_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1093           stk_args += 2;
1094 #endif
1095         } else {
1096           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1097           stk_args += 2;
1098         }
1099         break;
1100       case T_LONG:
1101         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1102         // fall through
1103       case T_OBJECT:
1104       case T_ARRAY:
1105       case T_ADDRESS:
1106       case T_METADATA:
1107         if (int_args < Argument::n_int_register_parameters_c) {
1108           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1109 #ifdef _WIN64
1110           fp_args++;
1111           stk_args += 2;
1112 #endif
1113         } else {
1114           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1115           stk_args += 2;
1116         }
1117         break;
1118       case T_FLOAT:
1119         if (fp_args < Argument::n_float_register_parameters_c) {
1120           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1121 #ifdef _WIN64
1122           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1124           stk_args += 2;
1125 #endif
1126         } else {
1127           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1128           stk_args += 2;
1129         }
1130         break;
1131       case T_DOUBLE:
1132         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1133         if (fp_args < Argument::n_float_register_parameters_c) {
1134           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1135 #ifdef _WIN64
1136           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1138           stk_args += 2;
1139 #endif
1140         } else {
1141           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1142           stk_args += 2;
1143         }
1144         break;
1145       case T_VOID: // Halves of longs and doubles
1146         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1147         regs[i].set_bad();
1148         break;
1149       default:
1150         ShouldNotReachHere();
1151         break;
1152       }
1153     }
1154 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
1157   if (stk_args < 8) {
1158     stk_args = 8;
1159   }
1160 #endif // _WIN64
1161 
1162   return stk_args;
1163 }
1164 
1165 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1166                                              uint num_bits,
1167                                              uint total_args_passed) {
1168   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1169          "only certain vector sizes are supported for now");
1170 
1171   static const XMMRegister VEC_ArgReg[32] = {
1172      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1173      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1174     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1175     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1176   };
1177 
1178   uint stk_args = 0;
1179   uint fp_args = 0;
1180 
1181   for (uint i = 0; i < total_args_passed; i++) {
1182     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1183     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1184     regs[i].set_pair(vmreg->next(next_val), vmreg);
1185   }
1186 
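  // For example, with num_bits == 256 each argument i is assigned the xmm
  // register VEC_ArgReg[i] as a pair spanning 8 consecutive 32-bit VMReg slots
  // (vmreg .. vmreg->next(7)); no stack slots are consumed, so 0 is returned.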
1187   return stk_args;
1188 }
1189 
1190 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1193   switch (ret_type) {
1194   case T_FLOAT:
1195     __ movflt(Address(rbp, -wordSize), xmm0);
1196     break;
1197   case T_DOUBLE:
1198     __ movdbl(Address(rbp, -wordSize), xmm0);
1199     break;
1200   case T_VOID:  break;
1201   default: {
1202     __ movptr(Address(rbp, -wordSize), rax);
1203     }
1204   }
1205 }
1206 
1207 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1210   switch (ret_type) {
1211   case T_FLOAT:
1212     __ movflt(xmm0, Address(rbp, -wordSize));
1213     break;
1214   case T_DOUBLE:
1215     __ movdbl(xmm0, Address(rbp, -wordSize));
1216     break;
1217   case T_VOID:  break;
1218   default: {
1219     __ movptr(rax, Address(rbp, -wordSize));
1220     }
1221   }
1222 }
1223 
1224 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1225     for ( int i = first_arg ; i < arg_count ; i++ ) {
1226       if (args[i].first()->is_Register()) {
1227         __ push(args[i].first()->as_Register());
1228       } else if (args[i].first()->is_XMMRegister()) {
1229         __ subptr(rsp, 2*wordSize);
1230         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1231       }
1232     }
1233 }
1234 
1235 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1236     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1237       if (args[i].first()->is_Register()) {
1238         __ pop(args[i].first()->as_Register());
1239       } else if (args[i].first()->is_XMMRegister()) {
1240         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1241         __ addptr(rsp, 2*wordSize);
1242       }
1243     }
1244 }
1245 
1246 static void verify_oop_args(MacroAssembler* masm,
1247                             const methodHandle& method,
1248                             const BasicType* sig_bt,
1249                             const VMRegPair* regs) {
1250   Register temp_reg = rbx;  // not part of any compiled calling seq
1251   if (VerifyOops) {
1252     for (int i = 0; i < method->size_of_parameters(); i++) {
1253       if (is_reference_type(sig_bt[i])) {
1254         VMReg r = regs[i].first();
1255         assert(r->is_valid(), "bad oop arg");
1256         if (r->is_stack()) {
1257           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1258           __ verify_oop(temp_reg);
1259         } else {
1260           __ verify_oop(r->as_Register());
1261         }
1262       }
1263     }
1264   }
1265 }
1266 
1267 static void check_continuation_enter_argument(VMReg actual_vmreg,
1268                                               Register expected_reg,
1269                                               const char* name) {
1270   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1271   assert(actual_vmreg->as_Register() == expected_reg,
1272          "%s is in unexpected register: %s instead of %s",
1273          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1274 }
1275 
1276 
1277 //---------------------------- continuation_enter_setup ---------------------------
1278 //
1279 // Arguments:
1280 //   None.
1281 //
1282 // Results:
1283 //   rsp: pointer to blank ContinuationEntry
1284 //
1285 // Kills:
1286 //   rax
1287 //
1288 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1289   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1290   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1291   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1292 
1293   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1294   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1295 
1296   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1297   OopMap* map = new OopMap(frame_size, 0);
1298 
1299   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1300   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1301   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1302 
1303   return map;
1304 }
1305 
1306 //---------------------------- fill_continuation_entry ---------------------------
1307 //
1308 // Arguments:
1309 //   rsp: pointer to blank Continuation entry
1310 //   reg_cont_obj: pointer to the continuation
1311 //   reg_flags: flags
1312 //
1313 // Results:
1314 //   rsp: pointer to filled out ContinuationEntry
1315 //
1316 // Kills:
1317 //   rax
1318 //
1319 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1320   assert_different_registers(rax, reg_cont_obj, reg_flags);
1321 #ifdef ASSERT
1322   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1323 #endif
1324   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1325   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1326   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1327   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1328   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1329 
1330   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1331   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1332   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1333   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1334 
1335   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1336   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1337 }
1338 
1339 //---------------------------- continuation_enter_cleanup ---------------------------
1340 //
1341 // Arguments:
1342 //   rsp: pointer to the ContinuationEntry
1343 //
1344 // Results:
1345 //   rsp: pointer to the spilled rbp in the entry frame
1346 //
1347 // Kills:
1348 //   rbx
1349 //
1350 static void continuation_enter_cleanup(MacroAssembler* masm) {
1351 #ifdef ASSERT
1352   Label L_good_sp;
1353   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1354   __ jcc(Assembler::equal, L_good_sp);
1355   __ stop("Incorrect rsp at continuation_enter_cleanup");
1356   __ bind(L_good_sp);
1357 #endif
1358   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1359   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1360 
1361   if (CheckJNICalls) {
1362     // Check if this is a virtual thread continuation
1363     Label L_skip_vthread_code;
1364     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1365     __ jcc(Assembler::equal, L_skip_vthread_code);
1366 
1367     // If the held monitor count is > 0 and this vthread is terminating then
1368     // it failed to release a JNI monitor. So we issue the same log message
1369     // that JavaThread::exit does.
1370     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1371     __ jcc(Assembler::equal, L_skip_vthread_code);
1372 
1373     // rax may hold an exception oop, save it before the call
1374     __ push(rax);
1375     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1376     __ pop(rax);
1377 
1378     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1379     // on termination. The held count is implicitly zeroed below when we restore from
1380     // the parent held count (which has to be zero).
1381     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1382 
1383     __ bind(L_skip_vthread_code);
1384   }
1385 #ifdef ASSERT
1386   else {
1387     // Check if this is a virtual thread continuation
1388     Label L_skip_vthread_code;
1389     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1390     __ jcc(Assembler::equal, L_skip_vthread_code);
1391 
1392     // See comment just above. If not checking JNI calls, the JNI count is only
1393     // needed for assertion checking.
1394     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1395 
1396     __ bind(L_skip_vthread_code);
1397   }
1398 #endif
1399 
1400   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1401   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1402 
1403   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1404   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1405   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1406 }
1407 
1408 static void gen_continuation_enter(MacroAssembler* masm,
1409                                    const VMRegPair* regs,
1410                                    int& exception_offset,
1411                                    OopMapSet* oop_maps,
1412                                    int& frame_complete,
1413                                    int& stack_slots,
1414                                    int& interpreted_entry_offset,
1415                                    int& compiled_entry_offset) {
1416 
1417   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1418   int pos_cont_obj   = 0;
1419   int pos_is_cont    = 1;
1420   int pos_is_virtual = 2;
1421 
1422   // The platform-specific calling convention may present the arguments in various registers.
1423   // To simplify the rest of the code, we expect the arguments to reside in these known
1424   // registers, and we additionally check the placement here in case the calling convention
1425   // ever changes.
1426   Register reg_cont_obj   = c_rarg1;
1427   Register reg_is_cont    = c_rarg2;
1428   Register reg_is_virtual = c_rarg3;
1429 
1430   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1431   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1432   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1433 
1434   // Utility methods kill rax, make sure there are no collisions
1435   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1436 
1437   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1438                          relocInfo::static_call_type);
1439 
1440   address start = __ pc();
1441 
1442   Label L_thaw, L_exit;
1443 
1444   // i2i entry, used only in interp_only_mode
1445   interpreted_entry_offset = __ pc() - start;
1446   {
1447 #ifdef ASSERT
1448     Label is_interp_only;
1449     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1450     __ jcc(Assembler::notEqual, is_interp_only);
1451     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1452     __ bind(is_interp_only);
1453 #endif
1454 
1455     __ pop(rax); // return address
1456     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1457     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1458     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1459     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1460     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1461     __ push(rax); // return address
1462     __ push_cont_fastpath();
1463 
1464     __ enter();
1465 
1466     stack_slots = 2; // will be adjusted in setup
1467     OopMap* map = continuation_enter_setup(masm, stack_slots);
1468     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1469     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1470 
1471     __ verify_oop(reg_cont_obj);
1472 
1473     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1474 
1475     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1476     __ testptr(reg_is_cont, reg_is_cont);
1477     __ jcc(Assembler::notZero, L_thaw);
1478 
1479     // --- Resolve path
1480 
1481     // Make sure the call is patchable
1482     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1483     // Emit stub for static call
1484     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1485     if (stub == nullptr) {
1486       fatal("CodeCache is full at gen_continuation_enter");
1487     }
1488     __ call(resolve);
1489     oop_maps->add_gc_map(__ pc() - start, map);
1490     __ post_call_nop();
1491 
1492     __ jmp(L_exit);
1493   }
1494 
1495   // compiled entry
1496   __ align(CodeEntryAlignment);
1497   compiled_entry_offset = __ pc() - start;
1498   __ enter();
1499 
1500   stack_slots = 2; // will be adjusted in setup
1501   OopMap* map = continuation_enter_setup(masm, stack_slots);
1502 
1503   // Frame is now completed as far as size and linkage.
1504   frame_complete = __ pc() - start;
1505 
1506   __ verify_oop(reg_cont_obj);
1507 
1508   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1509 
1510   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1511   __ testptr(reg_is_cont, reg_is_cont);
1512   __ jccb(Assembler::notZero, L_thaw);
1513 
1514   // --- call Continuation.enter(Continuation c, boolean isContinue)
1515 
1516   // Make sure the call is patchable
1517   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1518 
1519   // Emit stub for static call
1520   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1521   if (stub == nullptr) {
1522     fatal("CodeCache is full at gen_continuation_enter");
1523   }
1524 
1525   // The call needs to be resolved. There's a special case for this in
1526   // SharedRuntime::find_callee_info_helper() which calls
1527   // LinkResolver::resolve_continuation_enter() which resolves the call to
1528   // Continuation.enter(Continuation c, boolean isContinue).
1529   __ call(resolve);
1530 
1531   oop_maps->add_gc_map(__ pc() - start, map);
1532   __ post_call_nop();
1533 
1534   __ jmpb(L_exit);
1535 
1536   // --- Thawing path
1537 
1538   __ bind(L_thaw);
1539 
1540   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1541   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1542 
1543   ContinuationEntry::_return_pc_offset = __ pc() - start;
1544   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1545   __ post_call_nop();
1546 
1547   // --- Normal exit (resolve/thawing)
1548 
1549   __ bind(L_exit);
1550 
1551   continuation_enter_cleanup(masm);
1552   __ pop(rbp);
1553   __ ret(0);
1554 
1555   // --- Exception handling path
1556 
1557   exception_offset = __ pc() - start;
1558 
1559   continuation_enter_cleanup(masm);
1560   __ pop(rbp);
1561 
1562   __ movptr(c_rarg0, r15_thread);
1563   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1564 
1565   // rax still holds the original exception oop, save it before the call
1566   __ push(rax);
1567 
1568   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1569   __ movptr(rbx, rax);
1570 
1571   // Continue at exception handler:
1572   //   rax: exception oop
1573   //   rbx: exception handler
1574   //   rdx: exception pc
1575   __ pop(rax);
1576   __ verify_oop(rax);
1577   __ pop(rdx);
1578   __ jmp(rbx);
1579 }
1580 
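// Generates the intrinsic stub for Continuation.doYield: it calls the freeze
// entry and, if the freeze succeeded, returns through the continuation entry
// frame; otherwise (the continuation is pinned) it returns to the caller,
// forwarding any pending exception raised by the freeze.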
1581 static void gen_continuation_yield(MacroAssembler* masm,
1582                                    const VMRegPair* regs,
1583                                    OopMapSet* oop_maps,
1584                                    int& frame_complete,
1585                                    int& stack_slots,
1586                                    int& compiled_entry_offset) {
1587   enum layout {
1588     rbp_off,
1589     rbpH_off,
1590     return_off,
1591     return_off2,
1592     framesize // inclusive of return address
1593   };
1594   stack_slots = framesize / VMRegImpl::slots_per_word;
1595   assert(stack_slots == 2, "recheck layout");
1596 
1597   address start = __ pc();
1598   compiled_entry_offset = __ pc() - start;
1599   __ enter();
1600   address the_pc = __ pc();
1601 
1602   frame_complete = the_pc - start;
1603 
1604   // This nop must be exactly at the PC we push into the frame info.
1605   // We use this nop for fast CodeBlob lookup, associate the OopMap
1606   // with it right away.
1607   __ post_call_nop();
1608   OopMap* map = new OopMap(framesize, 1);
1609   oop_maps->add_gc_map(frame_complete, map);
1610 
1611   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1612   __ movptr(c_rarg0, r15_thread);
1613   __ movptr(c_rarg1, rsp);
1614   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1615   __ reset_last_Java_frame(true);
1616 
1617   Label L_pinned;
1618 
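  // The freeze result is in rax: zero means the continuation was frozen
  // successfully; any other value means it is pinned and we must return to
  // the caller.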
1619   __ testptr(rax, rax);
1620   __ jcc(Assembler::notZero, L_pinned);
1621 
1622   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1623   continuation_enter_cleanup(masm);
1624   __ pop(rbp);
1625   __ ret(0);
1626 
1627   __ bind(L_pinned);
1628 
1629   // Pinned, return to caller
1630 
1631   // handle pending exception thrown by freeze
1632   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1633   Label ok;
1634   __ jcc(Assembler::equal, ok);
1635   __ leave();
1636   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1637   __ bind(ok);
1638 
1639   __ leave();
1640   __ ret(0);
1641 }
1642 
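// Exported wrapper around the file-local continuation_enter_cleanup() above.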
1643 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1644   ::continuation_enter_cleanup(masm);
1645 }
1646 
1647 static void gen_special_dispatch(MacroAssembler* masm,
1648                                  const methodHandle& method,
1649                                  const BasicType* sig_bt,
1650                                  const VMRegPair* regs) {
1651   verify_oop_args(masm, method, sig_bt, regs);
1652   vmIntrinsics::ID iid = method->intrinsic_id();
1653 
1654   // Now write the args into the outgoing interpreter space
1655   bool     has_receiver   = false;
1656   Register receiver_reg   = noreg;
1657   int      member_arg_pos = -1;
1658   Register member_reg     = noreg;
1659   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1660   if (ref_kind != 0) {
1661     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1662     member_reg = rbx;  // known to be free at this point
1663     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1664   } else if (iid == vmIntrinsics::_invokeBasic) {
1665     has_receiver = true;
1666   } else if (iid == vmIntrinsics::_linkToNative) {
1667     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1668     member_reg = rbx;  // known to be free at this point
1669   } else {
1670     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1671   }
1672 
1673   if (member_reg != noreg) {
1674     // Load the member_arg into register, if necessary.
1675     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1676     VMReg r = regs[member_arg_pos].first();
1677     if (r->is_stack()) {
1678       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1679     } else {
1680       // no data motion is needed
1681       member_reg = r->as_Register();
1682     }
1683   }
1684 
1685   if (has_receiver) {
1686     // Make sure the receiver is loaded into a register.
1687     assert(method->size_of_parameters() > 0, "oob");
1688     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1689     VMReg r = regs[0].first();
1690     assert(r->is_valid(), "bad receiver arg");
1691     if (r->is_stack()) {
1692       // Porting note:  This assumes that compiled calling conventions always
1693       // pass the receiver oop in a register.  If this is not true on some
1694       // platform, pick a temp and load the receiver from stack.
1695       fatal("receiver always in a register");
1696       receiver_reg = j_rarg0;  // known to be free at this point
1697       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1698     } else {
1699       // no data motion is needed
1700       receiver_reg = r->as_Register();
1701     }
1702   }
1703 
1704   // Figure out which address we are really jumping to:
1705   MethodHandles::generate_method_handle_dispatch(masm, iid,
1706                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1707 }
1708 
1709 // ---------------------------------------------------------------------------
1710 // Generate a native wrapper for a given method.  The method takes arguments
1711 // in the Java compiled code convention, marshals them to the native
1712 // convention (handlizes oops, etc), transitions to native, makes the call,
1713 // returns to java state (possibly blocking), unhandlizes any result and
1714 // returns.
1715 //
1716 // Critical native functions are a shorthand for the use of
1717 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1718 // functions.  The wrapper is expected to unpack the arguments before
1719 // passing them to the callee. Critical native functions leave the state _in_Java,
1720 // since they cannot stop for GC.
1721 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1722 // block and the check for pending exceptions, since it is impossible for them
1723 // to be thrown.
1724 //
1725 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1726                                                 const methodHandle& method,
1727                                                 int compile_id,
1728                                                 BasicType* in_sig_bt,
1729                                                 VMRegPair* in_regs,
1730                                                 BasicType ret_type) {
1731   if (method->is_continuation_native_intrinsic()) {
1732     int exception_offset = -1;
1733     OopMapSet* oop_maps = new OopMapSet();
1734     int frame_complete = -1;
1735     int stack_slots = -1;
1736     int interpreted_entry_offset = -1;
1737     int vep_offset = -1;
1738     if (method->is_continuation_enter_intrinsic()) {
1739       gen_continuation_enter(masm,
1740                              in_regs,
1741                              exception_offset,
1742                              oop_maps,
1743                              frame_complete,
1744                              stack_slots,
1745                              interpreted_entry_offset,
1746                              vep_offset);
1747     } else if (method->is_continuation_yield_intrinsic()) {
1748       gen_continuation_yield(masm,
1749                              in_regs,
1750                              oop_maps,
1751                              frame_complete,
1752                              stack_slots,
1753                              vep_offset);
1754     } else {
1755       guarantee(false, "Unknown Continuation native intrinsic");
1756     }
1757 
1758 #ifdef ASSERT
1759     if (method->is_continuation_enter_intrinsic()) {
1760       assert(interpreted_entry_offset != -1, "Must be set");
1761       assert(exception_offset != -1,         "Must be set");
1762     } else {
1763       assert(interpreted_entry_offset == -1, "Must be unset");
1764       assert(exception_offset == -1,         "Must be unset");
1765     }
1766     assert(frame_complete != -1,    "Must be set");
1767     assert(stack_slots != -1,       "Must be set");
1768     assert(vep_offset != -1,        "Must be set");
1769 #endif
1770 
1771     __ flush();
1772     nmethod* nm = nmethod::new_native_nmethod(method,
1773                                               compile_id,
1774                                               masm->code(),
1775                                               vep_offset,
1776                                               frame_complete,
1777                                               stack_slots,
1778                                               in_ByteSize(-1),
1779                                               in_ByteSize(-1),
1780                                               oop_maps,
1781                                               exception_offset);
1782     if (nm == nullptr) return nm;
1783     if (method->is_continuation_enter_intrinsic()) {
1784       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1785     } else if (method->is_continuation_yield_intrinsic()) {
1786       _cont_doYield_stub = nm;
1787     }
1788     return nm;
1789   }
1790 
1791   if (method->is_method_handle_intrinsic()) {
1792     vmIntrinsics::ID iid = method->intrinsic_id();
1793     intptr_t start = (intptr_t)__ pc();
1794     int vep_offset = ((intptr_t)__ pc()) - start;
1795     gen_special_dispatch(masm,
1796                          method,
1797                          in_sig_bt,
1798                          in_regs);
1799     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1800     __ flush();
1801     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1802     return nmethod::new_native_nmethod(method,
1803                                        compile_id,
1804                                        masm->code(),
1805                                        vep_offset,
1806                                        frame_complete,
1807                                        stack_slots / VMRegImpl::slots_per_word,
1808                                        in_ByteSize(-1),
1809                                        in_ByteSize(-1),
1810                                        nullptr);
1811   }
1812   address native_func = method->native_function();
1813   assert(native_func != nullptr, "must have function");
1814 
1815   // An OopMap for lock (and class if static)
1816   OopMapSet *oop_maps = new OopMapSet();
1817   intptr_t start = (intptr_t)__ pc();
1818 
1819   // We have received a description of where all the java args are located
1820   // on entry to the wrapper. We need to convert these args to where
1821   // the jni function will expect them. To figure out where they go
1822   // we convert the java signature to a C signature by inserting
1823   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1824 
1825   const int total_in_args = method->size_of_parameters();
1826   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1827 
1828   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1829   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1830   BasicType* in_elem_bt = nullptr;
1831 
1832   int argc = 0;
1833   out_sig_bt[argc++] = T_ADDRESS;
1834   if (method->is_static()) {
1835     out_sig_bt[argc++] = T_OBJECT;
1836   }
1837 
1838   for (int i = 0; i < total_in_args ; i++ ) {
1839     out_sig_bt[argc++] = in_sig_bt[i];
1840   }
1841 
1842   // Now figure out where the args must be stored and how much stack space
1843   // they require.
1844   int out_arg_slots;
1845   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1846 
1847   // Compute framesize for the wrapper.  We need to handlize all oops in
1848   // incoming registers
1849 
1850   // Calculate the total number of stack slots we will need.
1851 
1852   // First count the abi requirement plus all of the outgoing args
1853   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1854 
1855   // Now the space for the inbound oop handle area
1856   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1857 
1858   int oop_handle_offset = stack_slots;
1859   stack_slots += total_save_slots;
1860 
1861   // Now any space we need for handlizing a klass if static method
1862 
1863   int klass_slot_offset = 0;
1864   int klass_offset = -1;
1865   int lock_slot_offset = 0;
1866   bool is_static = false;
1867 
1868   if (method->is_static()) {
1869     klass_slot_offset = stack_slots;
1870     stack_slots += VMRegImpl::slots_per_word;
1871     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1872     is_static = true;
1873   }
1874 
1875   // Plus a lock if needed
1876 
1877   if (method->is_synchronized()) {
1878     lock_slot_offset = stack_slots;
1879     stack_slots += VMRegImpl::slots_per_word;
1880   }
1881 
1882   // Now a place (+2) to save return values or temp during shuffling
1883   // + 4 for return address (which we own) and saved rbp
1884   stack_slots += 6;
1885 
1886   // OK, the space we have allocated will look like:
1887   //
1888   //
1889   // FP-> |                     |
1890   //      |---------------------|
1891   //      | 2 slots for moves   |
1892   //      |---------------------|
1893   //      | lock box (if sync)  |
1894   //      |---------------------| <- lock_slot_offset
1895   //      | klass (if static)   |
1896   //      |---------------------| <- klass_slot_offset
1897   //      | oopHandle area      |
1898   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1899   //      | outbound memory     |
1900   //      | based arguments     |
1901   //      |                     |
1902   //      |---------------------|
1903   //      |                     |
1904   // SP-> | out_preserved_slots |
1905   //
1906   //
1907 
1908 
1909   // Now compute actual number of stack words we need rounding to make
1910   // stack properly aligned.
1911   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1912 
1913   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1914 
1915   // First thing make an ic check to see if we should even be here
1916 
1917   // We are free to use all registers as temps without saving them and
1918   // restoring them except rbp. rbp is the only callee save register
1919   // as far as the interpreter and the compiler(s) are concerned.
1920 
1921   const Register receiver = j_rarg0;
1922 
1923   Label exception_pending;
1924 
1925   assert_different_registers(receiver, rscratch1, rscratch2);
1926   __ verify_oop(receiver);
1927   __ ic_check(8 /* end_alignment */);
1928 
1929   int vep_offset = ((intptr_t)__ pc()) - start;
1930 
1931   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1932     Label L_skip_barrier;
1933     Register klass = r10;
1934     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1935     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1936 
1937     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1938 
1939     __ bind(L_skip_barrier);
1940   }
1941 
1942 #ifdef COMPILER1
1943   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1944   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1945     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1946   }
1947 #endif // COMPILER1
1948 
1949   // The instruction at the verified entry point must be 5 bytes or longer
1950   // because it can be patched on the fly by make_non_entrant. The stack bang
1951   // instruction fits that requirement.
1952 
1953   // Generate stack overflow check
1954   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1955 
1956   // Generate a new frame for the wrapper.
1957   __ enter();
1958   // -2 because return address is already present and so is saved rbp
1959   __ subptr(rsp, stack_size - 2*wordSize);
1960 
1961   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1962   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1963   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1964 
1965   // Frame is now completed as far as size and linkage.
1966   int frame_complete = ((intptr_t)__ pc()) - start;
1967 
1968   if (UseRTMLocking) {
1969     // Abort RTM transaction before calling JNI
1970     // because the critical section will be large and will be
1971     // aborted anyway. Also the nmethod could be deoptimized.
1972     __ xabort(0);
1973   }
1974 
1975 #ifdef ASSERT
1976   __ check_stack_alignment(rsp, "improperly aligned stack");
1977 #endif /* ASSERT */
1978 
1979 
1980   // We use r14 as the oop handle for the receiver/klass
1981   // It is callee save so it survives the call to native
1982 
1983   const Register oop_handle_reg = r14;
1984 
1985   //
1986   // We immediately shuffle the arguments so that any vm call we have to
1987   // make from here on out (sync slow path, jvmti, etc.) we will have
1988   // captured the oops from our caller and have a valid oopMap for
1989   // them.
1990 
1991   // -----------------
1992   // The Grand Shuffle
1993 
1994   // The Java calling convention is either equal to (linux) or denser than (win64) the
1995   // C calling convention. However, because of the jni_env argument the C calling
1996   // convention always has at least one more argument (and two for static methods) than Java.
1997   // Therefore if we move the args from java -> c backwards then we will never have
1998   // a register->register conflict and we don't have to build a dependency graph
1999   // and figure out how to break any cycles.
2000   //
2001 
2002   // Record esp-based slot for receiver on stack for non-static methods
2003   int receiver_offset = -1;
2004 
2005   // This is a trick. We double the stack slots so we can claim
2006   // the oops in the caller's frame. Since we are sure to have
2007   // more args than the caller, doubling is enough to make
2008   // sure we can capture all the incoming oop args from the
2009   // caller.
2010   //
2011   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2012 
2013   // Mark location of rbp (someday)
2014   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2015 
2016   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2017   // All inbound args are referenced based on rbp and all outbound args via rsp.
2018 
2019 
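// In debug builds, track which registers have been clobbered by earlier
// argument moves so we can assert that no live incoming argument is read
// after its register was overwritten.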
2020 #ifdef ASSERT
2021   bool reg_destroyed[Register::number_of_registers];
2022   bool freg_destroyed[XMMRegister::number_of_registers];
2023   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2024     reg_destroyed[r] = false;
2025   }
2026   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2027     freg_destroyed[f] = false;
2028   }
2029 
2030 #endif /* ASSERT */
2031 
2032   // For JNI natives the incoming and outgoing registers are offset upwards.
2033   GrowableArray<int> arg_order(2 * total_in_args);
2034 
2035   VMRegPair tmp_vmreg;
2036   tmp_vmreg.set2(rbx->as_VMReg());
2037 
2038   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2039     arg_order.push(i);
2040     arg_order.push(c_arg);
2041   }
2042 
2043   int temploc = -1;
2044   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2045     int i = arg_order.at(ai);
2046     int c_arg = arg_order.at(ai + 1);
2047     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2048 #ifdef ASSERT
2049     if (in_regs[i].first()->is_Register()) {
2050       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2051     } else if (in_regs[i].first()->is_XMMRegister()) {
2052       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2053     }
2054     if (out_regs[c_arg].first()->is_Register()) {
2055       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2056     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2057       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2058     }
2059 #endif /* ASSERT */
2060     switch (in_sig_bt[i]) {
2061       case T_ARRAY:
2062       case T_OBJECT:
2063         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2064                     ((i == 0) && (!is_static)),
2065                     &receiver_offset);
2066         break;
2067       case T_VOID:
2068         break;
2069 
2070       case T_FLOAT:
2071         __ float_move(in_regs[i], out_regs[c_arg]);
2072         break;
2073 
2074       case T_DOUBLE:
2075         assert( i + 1 < total_in_args &&
2076                 in_sig_bt[i + 1] == T_VOID &&
2077                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2078         __ double_move(in_regs[i], out_regs[c_arg]);
2079         break;
2080 
2081       case T_LONG :
2082         __ long_move(in_regs[i], out_regs[c_arg]);
2083         break;
2084 
2085       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2086 
2087       default:
2088         __ move32_64(in_regs[i], out_regs[c_arg]);
2089     }
2090   }
2091 
2092   int c_arg;
2093 
2094   // Pre-load a static method's oop into r14.  Used both by locking code and
2095   // the normal JNI call code.
2096   // point c_arg at the first arg that is already loaded in case we
2097   // need to spill before we call out
2098   c_arg = total_c_args - total_in_args;
2099 
2100   if (method->is_static()) {
2101 
2102     //  load oop into a register
2103     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2104 
2105     // Now handlize the static class mirror; it's known to be non-null.
2106     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2107     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2108 
2109     // Now get the handle
2110     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2111     // store the klass handle as second argument
2112     __ movptr(c_rarg1, oop_handle_reg);
2113     // and protect the arg if we must spill
2114     c_arg--;
2115   }
2116 
2117   // Change state to native (we save the return address in the thread, since it might not
2118   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2119   // points into the right code segment. It does not have to be the correct return pc.
2120   // We use the same pc/oopMap repeatedly when we call out
2121 
2122   intptr_t the_pc = (intptr_t) __ pc();
2123   oop_maps->add_gc_map(the_pc - start, map);
2124 
2125   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2126 
2127 
2128   // We have all of the arguments set up at this point. We must not touch any of the
2129   // argument registers from here on (what if we save/restore them and there is no oop map covering them?).
2130 
2131   {
2132     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2133     // protect the args we've loaded
2134     save_args(masm, total_c_args, c_arg, out_regs);
2135     __ mov_metadata(c_rarg1, method());
2136     __ call_VM_leaf(
2137       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2138       r15_thread, c_rarg1);
2139     restore_args(masm, total_c_args, c_arg, out_regs);
2140   }
2141 
2142   // RedefineClasses() tracing support for obsolete method entry
2143   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2144     // protect the args we've loaded
2145     save_args(masm, total_c_args, c_arg, out_regs);
2146     __ mov_metadata(c_rarg1, method());
2147     __ call_VM_leaf(
2148       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2149       r15_thread, c_rarg1);
2150     restore_args(masm, total_c_args, c_arg, out_regs);
2151   }
2152 
2153   // Lock a synchronized method
2154 
2155   // Register definitions used by locking and unlocking
2156 
2157   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2158   const Register obj_reg  = rbx;  // Will contain the oop
2159   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2160   const Register old_hdr  = r13;  // value of old header at unlock time
2161 
2162   Label slow_path_lock;
2163   Label lock_done;
2164 
2165   if (method->is_synchronized()) {
2166     Label count_mon;
2167 
2168     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2169 
2170     // Get the handle (the 2nd argument)
2171     __ mov(oop_handle_reg, c_rarg1);
2172 
2173     // Get address of the box
2174 
2175     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2176 
2177     // Load the oop from the handle
2178     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2179 
2180     if (LockingMode == LM_MONITOR) {
2181       __ jmp(slow_path_lock);
2182     } else if (LockingMode == LM_LEGACY) {
2183       // Load immediate 1 into swap_reg %rax
2184       __ movl(swap_reg, 1);
2185 
2186       // Load (object->mark() | 1) into swap_reg %rax
2187       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2188 
2189       // Save (object->mark() | 1) into BasicLock's displaced header
2190       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2191 
2192       // src -> dest iff dest == rax else rax <- dest
2193       __ lock();
2194       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2195       __ jcc(Assembler::equal, count_mon);
2196 
2197       // Hmm should this move to the slow path code area???
2198 
2199       // Test if the oopMark is an obvious stack pointer, i.e.,
2200       //  1) (mark & 3) == 0, and
2201       //  2) rsp <= mark < mark + os::pagesize()
2202       // These 3 tests can be done by evaluating the following
2203       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2204       // assuming both stack pointer and pagesize have their
2205       // least significant 2 bits clear.
2206       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2207 
2208       __ subptr(swap_reg, rsp);
2209       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2210 
2211       // Save the test result, for recursive case, the result is zero
2212       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2213       __ jcc(Assembler::notEqual, slow_path_lock);
2214     } else {
2215       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2216       __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2217     }
2218     __ jmp (lock_done);
2219 
2220     __ bind(count_mon);
2221     __ inc_held_monitor_count();
2222 
2223     // Slow path will re-enter here
2224     __ bind(lock_done);
2225   }
2226 
2227   // Finally just about ready to make the JNI call
2228 
2229   // get JNIEnv* which is first argument to native
2230   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2231 
2232   // Now set thread in native
2233   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2234 
2235   __ call(RuntimeAddress(native_func));
2236 
2237   // Verify or restore cpu control state after JNI call
2238   __ restore_cpu_control_state_after_jni(rscratch1);
2239 
2240   // Unpack native results.
2241   switch (ret_type) {
2242   case T_BOOLEAN: __ c2bool(rax);            break;
2243   case T_CHAR   : __ movzwl(rax, rax);      break;
2244   case T_BYTE   : __ sign_extend_byte (rax); break;
2245   case T_SHORT  : __ sign_extend_short(rax); break;
2246   case T_INT    : /* nothing to do */        break;
2247   case T_DOUBLE :
2248   case T_FLOAT  :
2249     // Result is in xmm0 we'll save as needed
2250     break;
2251   case T_ARRAY:                 // Really a handle
2252   case T_OBJECT:                // Really a handle
2253       break; // can't de-handlize until after safepoint check
2254   case T_VOID: break;
2255   case T_LONG: break;
2256   default       : ShouldNotReachHere();
2257   }
2258 
2259   Label after_transition;
2260 
2261   // Switch thread to "native transition" state before reading the synchronization state.
2262   // This additional state is necessary because reading and testing the synchronization
2263   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2264   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2265   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2266   //     Thread A is resumed to finish this native method, but doesn't block here since it
2267   //     didn't see any synchronization in progress, and escapes.
2268   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2269 
2270   // Force this write out before the read below
2271   if (!UseSystemMemoryBarrier) {
2272     __ membar(Assembler::Membar_mask_bits(
2273               Assembler::LoadLoad | Assembler::LoadStore |
2274               Assembler::StoreLoad | Assembler::StoreStore));
2275   }
2276 
2277   // check for safepoint operation in progress and/or pending suspend requests
2278   {
2279     Label Continue;
2280     Label slow_path;
2281 
2282     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2283 
2284     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2285     __ jcc(Assembler::equal, Continue);
2286     __ bind(slow_path);
2287 
2288     // Don't use call_VM as it will see a possible pending exception and forward it
2289     // and never return here preventing us from clearing _last_native_pc down below.
2290     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2291     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2292     // by hand.
2293     //
2294     __ vzeroupper();
2295     save_native_result(masm, ret_type, stack_slots);
2296     __ mov(c_rarg0, r15_thread);
2297     __ mov(r12, rsp); // remember sp
2298     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2299     __ andptr(rsp, -16); // align stack as required by ABI
2300     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2301     __ mov(rsp, r12); // restore sp
2302     __ reinit_heapbase();
2303     // Restore any method result value
2304     restore_native_result(masm, ret_type, stack_slots);
2305     __ bind(Continue);
2306   }
2307 
2308   // change thread state
2309   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2310   __ bind(after_transition);
2311 
2312   Label reguard;
2313   Label reguard_done;
2314   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2315   __ jcc(Assembler::equal, reguard);
2316   __ bind(reguard_done);
2317 
2318   // native result if any is live
2319 
2320   // Unlock
2321   Label slow_path_unlock;
2322   Label unlock_done;
2323   if (method->is_synchronized()) {
2324 
2325     Label fast_done;
2326 
2327     // Get locked oop from the handle we passed to jni
2328     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2329 
2330     if (LockingMode == LM_LEGACY) {
2331       Label not_recur;
2332       // Simple recursive lock?
2333       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2334       __ jcc(Assembler::notEqual, not_recur);
2335       __ jmpb(fast_done);
2336       __ bind(not_recur);
2337     }
2338 
2339     // Must save rax if it is live now because cmpxchg must use it
2340     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2341       save_native_result(masm, ret_type, stack_slots);
2342     }
2343 
2344     if (LockingMode == LM_MONITOR) {
2345       __ jmp(slow_path_unlock);
2346     } else if (LockingMode == LM_LEGACY) {
2347       // get address of the stack lock
2348       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2349       //  get old displaced header
2350       __ movptr(old_hdr, Address(rax, 0));
2351 
2352       // Atomic swap old header if oop still contains the stack lock
2353       __ lock();
2354       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2355       __ jcc(Assembler::notEqual, slow_path_unlock);
2356       __ dec_held_monitor_count();
2357     } else {
2358       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2359       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2360     }
2361 
2362     // slow path re-enters here
2363     __ bind(unlock_done);
2364     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2365       restore_native_result(masm, ret_type, stack_slots);
2366     }
2367 
2368     __ bind(fast_done);
2369   }
2370   {
2371     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2372     save_native_result(masm, ret_type, stack_slots);
2373     __ mov_metadata(c_rarg1, method());
2374     __ call_VM_leaf(
2375          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2376          r15_thread, c_rarg1);
2377     restore_native_result(masm, ret_type, stack_slots);
2378   }
2379 
2380   __ reset_last_Java_frame(false);
2381 
2382   // Unbox oop result, e.g. JNIHandles::resolve value.
2383   if (is_reference_type(ret_type)) {
2384     __ resolve_jobject(rax /* value */,
2385                        r15_thread /* thread */,
2386                        rcx /* tmp */);
2387   }
2388 
2389   if (CheckJNICalls) {
2390     // clear_pending_jni_exception_check
2391     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2392   }
2393 
2394   // reset handle block
2395   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2396   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2397 
2398   // pop our frame
2399 
2400   __ leave();
2401 
2402   // Any exception pending?
2403   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2404   __ jcc(Assembler::notEqual, exception_pending);
2405 
2406   // Return
2407 
2408   __ ret(0);
2409 
2410   // Unexpected paths are out of line and go here
2411 
2412   // forward the exception
2413   __ bind(exception_pending);
2414 
2415   // and forward the exception
2416   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2417 
2418   // Slow path locking & unlocking
2419   if (method->is_synchronized()) {
2420 
2421     // BEGIN Slow path lock
2422     __ bind(slow_path_lock);
2423 
2424     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2425     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2426 
2427     // protect the args we've loaded
2428     save_args(masm, total_c_args, c_arg, out_regs);
2429 
2430     __ mov(c_rarg0, obj_reg);
2431     __ mov(c_rarg1, lock_reg);
2432     __ mov(c_rarg2, r15_thread);
2433 
2434     // Not a leaf but we have last_Java_frame setup as we want
2435     // Force freeze slow path on ObjectMonitor::enter() for now which will fail with freeze_pinned_native.
2436     __ push_cont_fastpath();
2437     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2438     __ pop_cont_fastpath();
2439     restore_args(masm, total_c_args, c_arg, out_regs);
2440 
2441 #ifdef ASSERT
2442     { Label L;
2443     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2444     __ jcc(Assembler::equal, L);
2445     __ stop("no pending exception allowed on exit from monitorenter");
2446     __ bind(L);
2447     }
2448 #endif
2449     __ jmp(lock_done);
2450 
2451     // END Slow path lock
2452 
2453     // BEGIN Slow path unlock
2454     __ bind(slow_path_unlock);
2455 
2456     // If we haven't already saved the native result we must save it now as xmm registers
2457     // are still exposed.
2458     __ vzeroupper();
2459     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2460       save_native_result(masm, ret_type, stack_slots);
2461     }
2462 
2463     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2464 
2465     __ mov(c_rarg0, obj_reg);
2466     __ mov(c_rarg2, r15_thread);
2467     __ mov(r12, rsp); // remember sp
2468     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2469     __ andptr(rsp, -16); // align stack as required by ABI
2470 
2471     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2472     // NOTE that obj_reg == rbx currently
2473     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2474     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2475 
2476     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2477     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2478     __ mov(rsp, r12); // restore sp
2479     __ reinit_heapbase();
2480 #ifdef ASSERT
2481     {
2482       Label L;
2483       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2484       __ jcc(Assembler::equal, L);
2485       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2486       __ bind(L);
2487     }
2488 #endif /* ASSERT */
2489 
2490     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2491 
2492     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2493       restore_native_result(masm, ret_type, stack_slots);
2494     }
2495     __ jmp(unlock_done);
2496 
2497     // END Slow path unlock
2498 
2499   } // synchronized
2500 
2501   // SLOW PATH Reguard the stack if needed
2502 
2503   __ bind(reguard);
2504   __ vzeroupper();
2505   save_native_result(masm, ret_type, stack_slots);
2506   __ mov(r12, rsp); // remember sp
2507   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2508   __ andptr(rsp, -16); // align stack as required by ABI
2509   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2510   __ mov(rsp, r12); // restore sp
2511   __ reinit_heapbase();
2512   restore_native_result(masm, ret_type, stack_slots);
2513   // and continue
2514   __ jmp(reguard_done);
2515 
2516 
2517 
2518   __ flush();
2519 
2520   nmethod *nm = nmethod::new_native_nmethod(method,
2521                                             compile_id,
2522                                             masm->code(),
2523                                             vep_offset,
2524                                             frame_complete,
2525                                             stack_slots / VMRegImpl::slots_per_word,
2526                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2527                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2528                                             oop_maps);
2529 
2530   return nm;
2531 }
2532 
2533 // This function returns the adjustment (in number of words) to a c2i adapter
2534 // activation, for use during deoptimization.
2535 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2536   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2537 }
2538 
2539 
2540 uint SharedRuntime::out_preserve_stack_slots() {
2541   return 0;
2542 }
2543 
2544 
2545 // Number of stack slots between incoming argument block and the start of
2546 // a new frame.  The PROLOG must add this many slots to the stack.  The
2547 // EPILOG must remove this many slots.  amd64 needs two slots for
2548 // return address.
2549 uint SharedRuntime::in_preserve_stack_slots() {
2550   return 4 + 2 * VerifyStackAtCalls;
2551 }
2552 
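// The current JavaThread* is kept in r15 by compiled code and the runtime
// stubs on x86_64, so that is the register exposed here.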
2553 VMReg SharedRuntime::thread_register() {
2554   return r15_thread->as_VMReg();
2555 }
2556 
2557 //------------------------------generate_deopt_blob----------------------------
2558 void SharedRuntime::generate_deopt_blob() {
2559   // Allocate space for the code
2560   ResourceMark rm;
2561   // Setup code generation tools
2562   int pad = 0;
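  // With AVX-512 the register save/restore code in the blob is considerably
  // larger (zmm registers), so reserve extra buffer space for it.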
2563   if (UseAVX > 2) {
2564     pad += 1024;
2565   }
2566 #if INCLUDE_JVMCI
2567   if (EnableJVMCI) {
2568     pad += 512; // Increase the buffer size when compiling for JVMCI
2569   }
2570 #endif
2571   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2572   MacroAssembler* masm = new MacroAssembler(&buffer);
2573   int frame_size_in_words;
2574   OopMap* map = nullptr;
2575   OopMapSet *oop_maps = new OopMapSet();
2576 
2577   // -------------
2578   // This code enters when returning to a de-optimized nmethod.  A return
2579   // address has been pushed on the stack, and return values are in
2580   // registers.
2581   // If we are doing a normal deopt then we were called from the patched
2582   // nmethod from the point we returned to the nmethod. So the return
2583   // address on the stack is wrong by NativeCall::instruction_size
2584   // We will adjust the value so it looks like we have the original return
2585   // address on the stack (like when we eagerly deoptimized).
2586   // In the case of an exception pending when deoptimizing, we enter
2587   // with a return address on the stack that points after the call we patched
2588   // into the exception handler. We have the following register state from,
2589   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2590   //    rax: exception oop
2591   //    rbx: exception handler
2592   //    rdx: throwing pc
2593   // So in this case we simply jam rdx into the useless return address and
2594   // the stack looks just like we want.
2595   //
2596   // At this point we need to de-opt.  We save the argument return
2597   // registers.  We call the first C routine, fetch_unroll_info().  This
2598   // routine captures the return values and returns a structure which
2599   // describes the current frame size and the sizes of all replacement frames.
2600   // The current frame is compiled code and may contain many inlined
2601   // functions, each with their own JVM state.  We pop the current frame, then
2602   // push all the new frames.  Then we call the C routine unpack_frames() to
2603   // populate these frames.  Finally unpack_frames() returns us the new target
2604   // address.  Notice that callee-save registers are BLOWN here; they have
2605   // already been captured in the vframeArray at the time the return PC was
2606   // patched.
2607   address start = __ pc();
2608   Label cont;
2609 
2610   // Prolog for non exception case!
2611 
2612   // Save everything in sight.
2613   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2614 
2615   // Normal deoptimization.  Save exec mode for unpack_frames.
2616   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2617   __ jmp(cont);
2618 
2619   int reexecute_offset = __ pc() - start;
2620 #if INCLUDE_JVMCI && !defined(COMPILER1)
2621   if (EnableJVMCI && UseJVMCICompiler) {
2622     // JVMCI does not use this kind of deoptimization
2623     __ should_not_reach_here();
2624   }
2625 #endif
2626 
2627   // Reexecute case
2628   // The return address is the pc that describes what bci to re-execute at.
2629 
2630   // No need to update map as each call to save_live_registers will produce identical oopmap
2631   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2632 
2633   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2634   __ jmp(cont);
2635 
2636 #if INCLUDE_JVMCI
2637   Label after_fetch_unroll_info_call;
2638   int implicit_exception_uncommon_trap_offset = 0;
2639   int uncommon_trap_offset = 0;
2640 
2641   if (EnableJVMCI) {
2642     implicit_exception_uncommon_trap_offset = __ pc() - start;
2643 
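    // Use the saved implicit-exception pc as the return address of this frame
    // and clear it in the thread so it is not reused.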
2644     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2645     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2646 
2647     uncommon_trap_offset = __ pc() - start;
2648 
2649     // Save everything in sight.
2650     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2651     // fetch_unroll_info needs to call last_java_frame()
2652     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2653 
2654     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2655     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2656 
2657     __ movl(r14, Deoptimization::Unpack_reexecute);
2658     __ mov(c_rarg0, r15_thread);
2659     __ movl(c_rarg2, r14); // exec mode
2660     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2661     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2662 
2663     __ reset_last_Java_frame(false);
2664 
2665     __ jmp(after_fetch_unroll_info_call);
2666   } // EnableJVMCI
2667 #endif // INCLUDE_JVMCI
2668 
2669   int exception_offset = __ pc() - start;
2670 
2671   // Prolog for exception case
2672 
2673   // All registers are dead at this entry point, except for rax and
2674   // rdx, which contain the exception oop and exception pc
2675   // respectively.  Set them in TLS and fall thru to the
2676   // unpack_with_exception_in_tls entry point.
2677 
2678   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2679   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2680 
2681   int exception_in_tls_offset = __ pc() - start;
2682 
2683   // new implementation because exception oop is now passed in JavaThread
2684 
2685   // Prolog for exception case
2686   // All registers must be preserved because they might be used by LinearScan
2687   // Exception oop and throwing PC are passed in JavaThread
2688   // tos: stack at point of call to method that threw the exception (i.e. only
2689   // args are on the stack, no return address)
2690 
2691   // make room on stack for the return address
2692   // It will be patched later with the throwing pc. The correct value is not
2693   // available now because loading it from memory would destroy registers.
2694   __ push(0);
2695 
2696   // Save everything in sight.
2697   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2698 
2699   // Now it is safe to overwrite any register
2700 
2701   // Deopt during an exception.  Save exec mode for unpack_frames.
2702   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2703 
2704   // load throwing pc from JavaThread and patch it as the return address
2705   // of the current frame. Then clear the field in JavaThread
2706 
2707   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2708   __ movptr(Address(rbp, wordSize), rdx);
2709   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2710 
2711 #ifdef ASSERT
2712   // verify that there is really an exception oop in JavaThread
2713   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2714   __ verify_oop(rax);
2715 
2716   // verify that there is no pending exception
2717   Label no_pending_exception;
2718   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2719   __ testptr(rax, rax);
2720   __ jcc(Assembler::zero, no_pending_exception);
2721   __ stop("must not have pending exception here");
2722   __ bind(no_pending_exception);
2723 #endif
2724 
2725   __ bind(cont);
2726 
2727   // Call C code.  Need thread and this frame, but NOT official VM entry
2728   // crud.  We cannot block on this call, no GC can happen.
2729   //
2730   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2731 
2732   // fetch_unroll_info needs to call last_java_frame().
2733 
2734   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2735 #ifdef ASSERT
2736   { Label L;
2737     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2738     __ jcc(Assembler::equal, L);
2739     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2740     __ bind(L);
2741   }
2742 #endif // ASSERT
2743   __ mov(c_rarg0, r15_thread);
2744   __ movl(c_rarg1, r14); // exec_mode
2745   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2746 
2747   // Need to have an oopmap that tells fetch_unroll_info where to
2748   // find any register it might need.
2749   oop_maps->add_gc_map(__ pc() - start, map);
2750 
2751   __ reset_last_Java_frame(false);
2752 
2753 #if INCLUDE_JVMCI
2754   if (EnableJVMCI) {
2755     __ bind(after_fetch_unroll_info_call);
2756   }
2757 #endif
2758 
2759   // Load UnrollBlock* into rdi
2760   __ mov(rdi, rax);
2761 
2762   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2763   Label noException;
2764   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2765   __ jcc(Assembler::notEqual, noException);
2766   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2767   // QQQ this is useless; it was set to null above
2768   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2769   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2770   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2771 
2772   __ verify_oop(rax);
2773 
2774   // Overwrite the result registers with the exception results.
2775   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2776   // I think this is useless
2777   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2778 
2779   __ bind(noException);
2780 
2781   // Only register save data is on the stack.
2782   // Now restore the result registers.  Everything else is either dead
2783   // or captured in the vframeArray.
2784   RegisterSaver::restore_result_registers(masm);
2785 
2786   // All of the register save area has been popped off the stack. Only the
2787   // return address remains.
2788 
2789   // Pop all the frames we must move/replace.
2790   //
2791   // Frame picture (youngest to oldest)
2792   // 1: self-frame (no frame link)
2793   // 2: deopting frame  (no frame link)
2794   // 3: caller of deopting frame (could be compiled/interpreted).
2795   //
2796   // Note: by leaving the return address of self-frame on the stack
2797   // and using the size of frame 2 to adjust the stack,
2798   // the return to frame 3 will still be on the stack when we are done.
2799 
2800   // Pop deoptimized frame
2801   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2802   __ addptr(rsp, rcx);
2803 
2804   // rsp should be pointing at the return address to the caller (3)
2805 
2806   // Pick up the initial fp we should save
2807   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2808   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2809 
2810 #ifdef ASSERT
2811   // Compilers generate code that bangs the stack by as much as the
2812   // interpreter would need, so this stack banging should never
2813   // trigger a fault. Verify that it does not on non-product builds.
2814   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2815   __ bang_stack_size(rbx, rcx);
2816 #endif
2817 
2818   // Load address of array of frame pcs into rcx
2819   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2820 
2821   // Trash the old pc
2822   __ addptr(rsp, wordSize);
2823 
2824   // Load address of array of frame sizes into rsi
2825   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2826 
2827   // Load counter into rdx
2828   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2829 
2830   // Now adjust the caller's stack to make up for the extra locals,
2831   // but record the original sp so that we can save it in the skeletal
2832   // interpreter frame; the stack walking of interpreter_sender will then
2833   // get the unextended sp value and not the "real" sp value.
2834 
2835   const Register sender_sp = r8;
2836 
2837   __ mov(sender_sp, rsp);
2838   __ movl(rbx, Address(rdi,
2839                        Deoptimization::UnrollBlock::
2840                        caller_adjustment_offset()));
2841   __ subptr(rsp, rbx);
2842 
2843   // Push interpreter frames in a loop
2844   Label loop;
2845   __ bind(loop);
2846   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2847   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2848   __ pushptr(Address(rcx, 0));          // Save return address
2849   __ enter();                           // Save old & set new ebp
2850   __ subptr(rsp, rbx);                  // Prolog
2851   // This value is corrected by layout_activation_impl
2852   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2853   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2854   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2855   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2856   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2857   __ decrementl(rdx);                   // Decrement counter
2858   __ jcc(Assembler::notZero, loop);
2859   __ pushptr(Address(rcx, 0));          // Save final return address
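
  // A hedged sketch of what the loop above built: one skeletal interpreter
  // frame per replacement frame, laid out (high to low addresses) as
  //   [ return pc ][ saved rbp ][ frame body: frame_size - 2 words ]
  // with the saved rbp values chaining the frames together via enter();
  // the final pushptr above supplies the pc that the re-pushed self-frame
  // below will eventually return to via ret(0).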
2860 
2861   // Re-push self-frame
2862   __ enter();                           // Save old & set new ebp
2863 
2864   // Allocate a full sized register save area.
2865   // Return address and rbp are in place, so we allocate two fewer words.
2866   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2867 
2868   // Restore frame locals after moving the frame
2869   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2870   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2871 
2872   // Call C code.  Need thread but NOT official VM entry
2873   // crud.  We cannot block on this call, no GC can happen.  Call should
2874   // restore return values to their stack-slots with the new SP.
2875   //
2876   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2877 
2878   // Use rbp because the frames look interpreted now
2879   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2880   // Don't need the precise return PC here, just precise enough to point into this code blob.
2881   address the_pc = __ pc();
2882   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2883 
2884   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2885   __ mov(c_rarg0, r15_thread);
2886   __ movl(c_rarg1, r14); // second arg: exec_mode
2887   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2888   // Revert SP alignment after call since we're going to do some SP relative addressing below
2889   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2890 
2891   // Set an oopmap for the call site
2892   // Use the same PC we used for the last java frame
2893   oop_maps->add_gc_map(the_pc - start,
2894                        new OopMap( frame_size_in_words, 0 ));
2895 
2896   // Clear fp AND pc
2897   __ reset_last_Java_frame(true);
2898 
2899   // Collect return values
2900   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2901   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2902   // I think this is useless (throwing pc?)
2903   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2904 
2905   // Pop self-frame.
2906   __ leave();                           // Epilog
2907 
2908   // Jump to interpreter
2909   __ ret(0);
2910 
2911   // Make sure all code is generated
2912   masm->flush();
2913 
2914   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2915   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2916 #if INCLUDE_JVMCI
2917   if (EnableJVMCI) {
2918     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2919     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2920   }
2921 #endif
2922 }
2923 
2924 #ifdef COMPILER2
2925 //------------------------------generate_uncommon_trap_blob--------------------
2926 void SharedRuntime::generate_uncommon_trap_blob() {
2927   // Allocate space for the code
2928   ResourceMark rm;
2929   // Setup code generation tools
2930   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2931   MacroAssembler* masm = new MacroAssembler(&buffer);
2932 
2933   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2934 
2935   address start = __ pc();
2936 
2937   if (UseRTMLocking) {
2938     // Abort RTM transaction before possible nmethod deoptimization.
2939     __ xabort(0);
2940   }
2941 
2942   // Push self-frame.  We get here with a return address on the
2943   // stack, so rsp is 8-byte aligned until we allocate our frame.
2944   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2945 
2946   // No callee saved registers. rbp is assumed implicitly saved
2947   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2948 
2949   // The compiler left unloaded_class_index in j_rarg0; move it to where
2950   // the runtime expects it.
2951   __ movl(c_rarg1, j_rarg0);
2952 
2953   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2954 
2955   // Call C code.  Need thread but NOT official VM entry
2956   // crud.  We cannot block on this call, no GC can happen.  Call should
2957   // capture callee-saved registers as well as return values.
2958   // Thread is in rdi already.
2959   //
2960   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2961 
2962   __ mov(c_rarg0, r15_thread);
2963   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2964   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2965 
2966   // Set an oopmap for the call site
2967   OopMapSet* oop_maps = new OopMapSet();
2968   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2969 
2970   // location of rbp is known implicitly by the frame sender code
2971 
2972   oop_maps->add_gc_map(__ pc() - start, map);
2973 
2974   __ reset_last_Java_frame(false);
2975 
2976   // Load UnrollBlock* into rdi
2977   __ mov(rdi, rax);
2978 
2979 #ifdef ASSERT
2980   { Label L;
2981     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
2982               Deoptimization::Unpack_uncommon_trap);
2983     __ jcc(Assembler::equal, L);
2984     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2985     __ bind(L);
2986   }
2987 #endif
2988 
2989   // Pop all the frames we must move/replace.
2990   //
2991   // Frame picture (youngest to oldest)
2992   // 1: self-frame (no frame link)
2993   // 2: deopting frame  (no frame link)
2994   // 3: caller of deopting frame (could be compiled/interpreted).
2995 
2996   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2997   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2998 
2999   // Pop deoptimized frame (int)
3000   __ movl(rcx, Address(rdi,
3001                        Deoptimization::UnrollBlock::
3002                        size_of_deoptimized_frame_offset()));
3003   __ addptr(rsp, rcx);
3004 
3005   // rsp should be pointing at the return address to the caller (3)
3006 
3007   // Pick up the initial fp we should save
3008   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3009   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3010 
3011 #ifdef ASSERT
3012   // Compilers generate code that bangs the stack by as much as the
3013   // interpreter would need, so this stack banging should never
3014   // trigger a fault. Verify that it does not on non-product builds.
3015   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3016   __ bang_stack_size(rbx, rcx);
3017 #endif
3018 
3019   // Load address of array of frame pcs into rcx (address*)
3020   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3021 
3022   // Trash the return pc
3023   __ addptr(rsp, wordSize);
3024 
3025   // Load address of array of frame sizes into rsi (intptr_t*)
3026   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3027 
3028   // Counter
3029   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
3030 
3031   // Now adjust the caller's stack to make up for the extra locals but
3032   // record the original sp so that we can save it in the skeletal
3033   // interpreter frame and the stack walking of interpreter_sender
3034   // will get the unextended sp value and not the "real" sp value.
3035 
3036   const Register sender_sp = r8;
3037 
3038   __ mov(sender_sp, rsp);
3039   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
3040   __ subptr(rsp, rbx);
3041 
3042   // Push interpreter frames in a loop
3043   Label loop;
3044   __ bind(loop);
3045   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3046   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3047   __ pushptr(Address(rcx, 0));     // Save return address
3048   __ enter();                      // Save old & set new rbp
3049   __ subptr(rsp, rbx);             // Prolog
3050   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3051             sender_sp);            // Make it walkable
3052   // This value is corrected by layout_activation_impl
3053   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3054   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3055   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3056   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3057   __ decrementl(rdx);              // Decrement counter
3058   __ jcc(Assembler::notZero, loop);
3059   __ pushptr(Address(rcx, 0));     // Save final return address
3060 
3061   // Re-push self-frame
3062   __ enter();                 // Save old & set new rbp
3063   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3064                               // Prolog
3065 
3066   // Use rbp because the frames look interpreted now
3067   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3068   // Don't need the precise return PC here, just precise enough to point into this code blob.
3069   address the_pc = __ pc();
3070   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3071 
3072   // Call C code.  Need thread but NOT official VM entry
3073   // crud.  We cannot block on this call, no GC can happen.  Call should
3074   // restore return values to their stack-slots with the new SP.
3075   // Thread is in rdi already.
3076   //
3077   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3078 
3079   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3080   __ mov(c_rarg0, r15_thread);
3081   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3082   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3083 
3084   // Set an oopmap for the call site
3085   // Use the same PC we used for the last java frame
3086   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3087 
3088   // Clear fp AND pc
3089   __ reset_last_Java_frame(true);
3090 
3091   // Pop self-frame.
3092   __ leave();                 // Epilog
3093 
3094   // Jump to interpreter
3095   __ ret(0);
3096 
3097   // Make sure all code is generated
3098   masm->flush();
3099 
3100   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3101                                                  SimpleRuntimeFrame::framesize >> 1);
3102 }
3103 #endif // COMPILER2
3104 
3105 //------------------------------generate_handler_blob------
3106 //
3107 // Generate a special Compile2Runtime blob that saves all registers
3108 // and sets up the oopmap.
3109 //
3110 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3111   assert(StubRoutines::forward_exception_entry() != nullptr,
3112          "must be generated before");
3113 
3114   ResourceMark rm;
3115   OopMapSet *oop_maps = new OopMapSet();
3116   OopMap* map;
3117 
3118   // Allocate space for the code.  Setup code generation tools.
3119   CodeBuffer buffer("handler_blob", 2048, 1024);
3120   MacroAssembler* masm = new MacroAssembler(&buffer);
3121 
3122   address start   = __ pc();
3123   address call_pc = nullptr;
3124   int frame_size_in_words;
3125   bool cause_return = (poll_type == POLL_AT_RETURN);
3126   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
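
  // Note (informal): cause_return is true only for polls taken at a return
  // (POLL_AT_RETURN), where a valid return address is already on the stack;
  // for polls taken inside a method we push a placeholder below and later
  // patch in the pc that the signal handler saved in the thread.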
3127 
3128   if (UseRTMLocking) {
3129     // Abort RTM transaction before calling runtime
3130     // because critical section will be large and will be
3131     // aborted anyway. Also nmethod could be deoptimized.
3132     __ xabort(0);
3133   }
3134 
3135   // Make room for return address (or push it again)
3136   if (!cause_return) {
3137     __ push(rbx);
3138   }
3139 
3140   // Save registers, fpu state, and flags
3141   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3142 
3143   // The following is basically a call_VM.  However, we need the precise
3144   // address of the call in order to generate an oopmap. Hence, we do all the
3145   // work ourselves.
3146 
3147   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3148 
3149   // The return address must always be correct so that the frame constructor never
3150   // sees an invalid pc.
3151 
3152   if (!cause_return) {
3153     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3154     // Additionally, rbx is a callee saved register and we can look at it later to determine
3155     // if someone changed the return address for us!
3156     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3157     __ movptr(Address(rbp, wordSize), rbx);
3158   }
3159 
3160   // Do the call
3161   __ mov(c_rarg0, r15_thread);
3162   __ call(RuntimeAddress(call_ptr));
3163 
3164   // Set an oopmap for the call site.  This oopmap will map all
3165   // oop-registers and debug-info registers as callee-saved.  This
3166   // will allow deoptimization at this safepoint to find all possible
3167   // debug-info recordings, as well as let GC find all oops.
3168 
3169   oop_maps->add_gc_map( __ pc() - start, map);
3170 
3171   Label noException;
3172 
3173   __ reset_last_Java_frame(false);
3174 
3175   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3176   __ jcc(Assembler::equal, noException);
3177 
3178   // Exception pending
3179 
3180   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3181 
3182   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3183 
3184   // No exception case
3185   __ bind(noException);
3186 
3187   Label no_adjust;
3188 #ifdef ASSERT
3189   Label bail;
3190 #endif
3191   if (!cause_return) {
3192     Label no_prefix, not_special;
3193 
3194     // If our stashed return pc was modified by the runtime we avoid touching it
3195     __ cmpptr(rbx, Address(rbp, wordSize));
3196     __ jccb(Assembler::notEqual, no_adjust);
3197 
3198     // Skip over the poll instruction.
3199     // See NativeInstruction::is_safepoint_poll()
3200     // Possible encodings:
3201     //      85 00       test   %eax,(%rax)
3202     //      85 01       test   %eax,(%rcx)
3203     //      85 02       test   %eax,(%rdx)
3204     //      85 03       test   %eax,(%rbx)
3205     //      85 06       test   %eax,(%rsi)
3206     //      85 07       test   %eax,(%rdi)
3207     //
3208     //   41 85 00       test   %eax,(%r8)
3209     //   41 85 01       test   %eax,(%r9)
3210     //   41 85 02       test   %eax,(%r10)
3211     //   41 85 03       test   %eax,(%r11)
3212     //   41 85 06       test   %eax,(%r14)
3213     //   41 85 07       test   %eax,(%r15)
3214     //
3215     //      85 04 24    test   %eax,(%rsp)
3216     //   41 85 04 24    test   %eax,(%r12)
3217     //      85 45 00    test   %eax,0x0(%rbp)
3218     //   41 85 45 00    test   %eax,0x0(%r13)
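    //
    // So the poll is 2, 3, or 4 bytes long: opcode + modrm, plus an
    // optional REX.B prefix, plus one extra SIB or disp8 byte when the
    // base register is rsp/rbp/r12/r13.  The code below recomputes that
    // length in order to advance the saved return pc past the poll.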
3219 
3220     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3221     __ jcc(Assembler::notEqual, no_prefix);
3222     __ addptr(rbx, 1);
3223     __ bind(no_prefix);
3224 #ifdef ASSERT
3225     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3226 #endif
3227     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3228     // r12/rsp 0x04
3229     // r13/rbp 0x05
3230     __ movzbq(rcx, Address(rbx, 1));
3231     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3232     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3233     __ cmpptr(rcx, 1);
3234     __ jcc(Assembler::above, not_special);
3235     __ addptr(rbx, 1);
3236     __ bind(not_special);
3237 #ifdef ASSERT
3238     // Verify the correct encoding of the poll we're about to skip.
3239     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3240     __ jcc(Assembler::notEqual, bail);
3241     // Mask out the modrm bits
3242     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3243     // rax encodes to 0, so if the bits are nonzero it's incorrect
3244     __ jcc(Assembler::notZero, bail);
3245 #endif
3246     // Adjust return pc forward to step over the safepoint poll instruction
3247     __ addptr(rbx, 2);
3248     __ movptr(Address(rbp, wordSize), rbx);
3249   }
3250 
3251   __ bind(no_adjust);
3252   // Normal exit, restore registers and exit.
3253   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3254   __ ret(0);
3255 
3256 #ifdef ASSERT
3257   __ bind(bail);
3258   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3259 #endif
3260 
3261   // Make sure all code is generated
3262   masm->flush();
3263 
3264   // Fill-out other meta info
3265   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3266 }
3267 
3268 //
3269 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3270 //
3271 // Generate a stub that calls into the vm to find out the proper destination
3272 // of a java call. All the argument registers are live at this point,
3273 // but since this is generic code we don't know what they are, and the
3274 // caller must do any gc of the args.
3275 //
3276 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3277   assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3278 
3279   // allocate space for the code
3280   ResourceMark rm;
3281 
3282   CodeBuffer buffer(name, 1200, 512);
3283   MacroAssembler* masm = new MacroAssembler(&buffer);
3284 
3285   int frame_size_in_words;
3286 
3287   OopMapSet *oop_maps = new OopMapSet();
3288   OopMap* map = nullptr;
3289 
3290   int start = __ offset();
3291 
3292   // No need to save vector registers since they are caller-saved anyway.
3293   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3294 
3295   int frame_complete = __ offset();
3296 
3297   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3298 
3299   __ mov(c_rarg0, r15_thread);
3300 
3301   __ call(RuntimeAddress(destination));
3302 
3303 
3304   // Set an oopmap for the call site.
3305   // We need this not only for callee-saved registers, but also for volatile
3306   // registers that the compiler might be keeping live across a safepoint.
3307 
3308   oop_maps->add_gc_map( __ offset() - start, map);
3309 
3310   // rax contains the address we are going to jump to assuming no exception got installed
3311 
3312   // clear last_Java_sp
3313   __ reset_last_Java_frame(false);
3314   // check for pending exceptions
3315   Label pending;
3316   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3317   __ jcc(Assembler::notEqual, pending);
3318 
3319   // get the returned Method*
3320   __ get_vm_result_2(rbx, r15_thread);
3321   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3322 
3323   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3324 
3325   RegisterSaver::restore_live_registers(masm);
3326 
3327   // We are back to the original state on entry and ready to go.
3328 
3329   __ jmp(rax);
3330 
3331   // Pending exception after the safepoint
3332 
3333   __ bind(pending);
3334 
3335   RegisterSaver::restore_live_registers(masm);
3336 
3337   // exception pending => remove activation and forward to exception handler
3338 
3339   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3340 
3341   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3342   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3343 
3344   // -------------
3345   // make sure all code is generated
3346   masm->flush();
3347 
3348   // return the blob
3349   // (the frame size passed to new_runtime_stub is in words)
3350   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3351 }
3352 
3353 //------------------------------Montgomery multiplication------------------------
3354 //
3355 
3356 #ifndef _WINDOWS
3357 
3358 // Subtract 0:b from carry:a.  Return carry.
3359 static julong
3360 sub(julong a[], julong b[], julong carry, long len) {
3361   long long i = 0, cnt = len;
3362   julong tmp;
3363   asm volatile("clc; "
3364                "0: ; "
3365                "mov (%[b], %[i], 8), %[tmp]; "
3366                "sbb %[tmp], (%[a], %[i], 8); "
3367                "inc %[i]; dec %[cnt]; "
3368                "jne 0b; "
3369                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3370                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3371                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3372                : "memory");
3373   return tmp;
3374 }
3375 
3376 // Multiply (unsigned) Long A by Long B, accumulating the double-
3377 // length result into the accumulator formed of T0, T1, and T2.
3378 #define MACC(A, B, T0, T1, T2)                                  \
3379 do {                                                            \
3380   unsigned long hi, lo;                                         \
3381   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3382            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3383            : "r"(A), "a"(B) : "cc");                            \
3384  } while(0)
3385 
3386 // As above, but add twice the double-length result into the
3387 // accumulator.
3388 #define MACC2(A, B, T0, T1, T2)                                 \
3389 do {                                                            \
3390   unsigned long hi, lo;                                         \
3391   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3392            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3393            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3394            : "r"(A), "a"(B) : "cc");                            \
3395  } while(0)
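
// Informally, treating T2:T1:T0 as one little-endian multi-word accumulator:
//   MACC  does  accumulator += A * B        (the full 128-bit product)
//   MACC2 does  accumulator += 2 * (A * B)  (the product added twice)
// with carries propagated through T1 into T2.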
3396 
3397 #else //_WINDOWS
3398 
3399 static julong
3400 sub(julong a[], julong b[], julong carry, long len) {
3401   long i;
3402   julong tmp;
3403   unsigned char c = 1;
3404   for (i = 0; i < len; i++) {
3405     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3406     a[i] = tmp;
3407   }
3408   c = _addcarry_u64(c, carry, ~0, &tmp);
3409   return tmp;
3410 }
3411 
3412 // Multiply (unsigned) Long A by Long B, accumulating the double-
3413 // length result into the accumulator formed of T0, T1, and T2.
3414 #define MACC(A, B, T0, T1, T2)                          \
3415 do {                                                    \
3416   julong hi, lo;                            \
3417   lo = _umul128(A, B, &hi);                             \
3418   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3419   c = _addcarry_u64(c, hi, T1, &T1);                    \
3420   _addcarry_u64(c, T2, 0, &T2);                         \
3421  } while(0)
3422 
3423 // As above, but add twice the double-length result into the
3424 // accumulator.
3425 #define MACC2(A, B, T0, T1, T2)                         \
3426 do {                                                    \
3427   julong hi, lo;                            \
3428   lo = _umul128(A, B, &hi);                             \
3429   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3430   c = _addcarry_u64(c, hi, T1, &T1);                    \
3431   _addcarry_u64(c, T2, 0, &T2);                         \
3432   c = _addcarry_u64(0, lo, T0, &T0);                    \
3433   c = _addcarry_u64(c, hi, T1, &T1);                    \
3434   _addcarry_u64(c, T2, 0, &T2);                         \
3435  } while(0)
3436 
3437 #endif //_WINDOWS
3438 
3439 // Fast Montgomery multiplication.  The derivation of the algorithm is
3440 // in  A Cryptographic Library for the Motorola DSP56000,
3441 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
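//
// Informally: with R = 2^(64*len) and inv chosen so that
// inv * n[0] == -1 (mod 2^64), which is what the ULLONG_MAX assert below
// checks, montgomery_multiply computes m = a * b * R^-1 (mod n) one
// 64-bit word at a time; the trailing while (t0) loop folds any
// remaining carry back into range by subtracting n.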
3442 
3443 static void NOINLINE
3444 montgomery_multiply(julong a[], julong b[], julong n[],
3445                     julong m[], julong inv, int len) {
3446   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3447   int i;
3448 
3449   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3450 
3451   for (i = 0; i < len; i++) {
3452     int j;
3453     for (j = 0; j < i; j++) {
3454       MACC(a[j], b[i-j], t0, t1, t2);
3455       MACC(m[j], n[i-j], t0, t1, t2);
3456     }
3457     MACC(a[i], b[0], t0, t1, t2);
3458     m[i] = t0 * inv;
3459     MACC(m[i], n[0], t0, t1, t2);
3460 
3461     assert(t0 == 0, "broken Montgomery multiply");
3462 
3463     t0 = t1; t1 = t2; t2 = 0;
3464   }
3465 
3466   for (i = len; i < 2*len; i++) {
3467     int j;
3468     for (j = i-len+1; j < len; j++) {
3469       MACC(a[j], b[i-j], t0, t1, t2);
3470       MACC(m[j], n[i-j], t0, t1, t2);
3471     }
3472     m[i-len] = t0;
3473     t0 = t1; t1 = t2; t2 = 0;
3474   }
3475 
3476   while (t0)
3477     t0 = sub(m, n, t0, len);
3478 }
3479 
3480 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3481 // multiplies so it should be up to 25% faster than Montgomery
3482 // multiplication.  However, its loop control is more complex and it
3483 // may actually run slower on some machines.
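//
// The saving comes from folding the symmetric partial products
// a[j]*a[i-j] and a[i-j]*a[j] into a single MACC2 (which adds the product
// twice), with one ordinary MACC for the diagonal term a[j]*a[j] when i
// is even.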
3484 
3485 static void NOINLINE
3486 montgomery_square(julong a[], julong n[],
3487                   julong m[], julong inv, int len) {
3488   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3489   int i;
3490 
3491   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3492 
3493   for (i = 0; i < len; i++) {
3494     int j;
3495     int end = (i+1)/2;
3496     for (j = 0; j < end; j++) {
3497       MACC2(a[j], a[i-j], t0, t1, t2);
3498       MACC(m[j], n[i-j], t0, t1, t2);
3499     }
3500     if ((i & 1) == 0) {
3501       MACC(a[j], a[j], t0, t1, t2);
3502     }
3503     for (; j < i; j++) {
3504       MACC(m[j], n[i-j], t0, t1, t2);
3505     }
3506     m[i] = t0 * inv;
3507     MACC(m[i], n[0], t0, t1, t2);
3508 
3509     assert(t0 == 0, "broken Montgomery square");
3510 
3511     t0 = t1; t1 = t2; t2 = 0;
3512   }
3513 
3514   for (i = len; i < 2*len; i++) {
3515     int start = i-len+1;
3516     int end = start + (len - start)/2;
3517     int j;
3518     for (j = start; j < end; j++) {
3519       MACC2(a[j], a[i-j], t0, t1, t2);
3520       MACC(m[j], n[i-j], t0, t1, t2);
3521     }
3522     if ((i & 1) == 0) {
3523       MACC(a[j], a[j], t0, t1, t2);
3524     }
3525     for (; j < len; j++) {
3526       MACC(m[j], n[i-j], t0, t1, t2);
3527     }
3528     m[i-len] = t0;
3529     t0 = t1; t1 = t2; t2 = 0;
3530   }
3531 
3532   while (t0)
3533     t0 = sub(m, n, t0, len);
3534 }
3535 
3536 // Swap words in a longword.
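// For example, swap(0x11111111aaaaaaaa) == 0xaaaaaaaa11111111.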
3537 static julong swap(julong x) {
3538   return (x << 32) | (x >> 32);
3539 }
3540 
3541 // Copy len longwords from s to d, word-swapping as we go.  The
3542 // destination array is reversed.
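// For example (illustrative), with len == 2,
//   s = { 0xAAAAAAAABBBBBBBB, 0xCCCCCCCCDDDDDDDD }
// becomes
//   d = { 0xDDDDDDDDCCCCCCCC, 0xBBBBBBBBAAAAAAAA }.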
3543 static void reverse_words(julong *s, julong *d, int len) {
3544   d += len;
3545   while(len-- > 0) {
3546     d--;
3547     *d = swap(*s);
3548     s++;
3549   }
3550 }
3551 
3552 // The threshold at which squaring is advantageous was determined
3553 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3554 #define MONTGOMERY_SQUARING_THRESHOLD 64
3555 
3556 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3557                                         jint len, jlong inv,
3558                                         jint *m_ints) {
3559   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3560   int longwords = len/2;
3561 
3562   // Make very sure we don't use so much space that the stack might
3563   // overflow.  512 jints corresponds to a 16384-bit integer and
3564   // will use here a total of 8k bytes of stack space.
3565   int divisor = sizeof(julong) * 4;
3566   guarantee(longwords <= 8192 / divisor, "must be");
3567   int total_allocation = longwords * sizeof (julong) * 4;
3568   julong *scratch = (julong *)alloca(total_allocation);
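  // (Checking the bound: 8192 / (8 * 4) = 256 longwords = 512 jints, i.e.
  // a 16384-bit integer, and 4 * 256 * 8 = 8192 bytes of scratch.)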
3569 
3570   // Local scratch arrays
3571   julong
3572     *a = scratch + 0 * longwords,
3573     *b = scratch + 1 * longwords,
3574     *n = scratch + 2 * longwords,
3575     *m = scratch + 3 * longwords;
3576 
3577   reverse_words((julong *)a_ints, a, longwords);
3578   reverse_words((julong *)b_ints, b, longwords);
3579   reverse_words((julong *)n_ints, n, longwords);
3580 
3581   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3582 
3583   reverse_words(m, (julong *)m_ints, longwords);
3584 }
3585 
3586 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3587                                       jint len, jlong inv,
3588                                       jint *m_ints) {
3589   assert(len % 2 == 0, "array length in montgomery_square must be even");
3590   int longwords = len/2;
3591 
3592   // Make very sure we don't use so much space that the stack might
3593   // overflow.  512 jints corresponds to a 16384-bit integer and
3594   // will use here a total of 6k bytes of stack space.
3595   int divisor = sizeof(julong) * 3;
3596   guarantee(longwords <= (8192 / divisor), "must be");
3597   int total_allocation = longwords * sizeof (julong) * 3;
3598   julong *scratch = (julong *)alloca(total_allocation);
3599 
3600   // Local scratch arrays
3601   julong
3602     *a = scratch + 0 * longwords,
3603     *n = scratch + 1 * longwords,
3604     *m = scratch + 2 * longwords;
3605 
3606   reverse_words((julong *)a_ints, a, longwords);
3607   reverse_words((julong *)n_ints, n, longwords);
3608 
3609   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3610     ::montgomery_square(a, n, m, (julong)inv, longwords);
3611   } else {
3612     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3613   }
3614 
3615   reverse_words(m, (julong *)m_ints, longwords);
3616 }
3617 
3618 #ifdef COMPILER2
3619 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3620 //
3621 //------------------------------generate_exception_blob---------------------------
3622 // Creates the exception blob at the end.
3623 // Compiled code jumps to this blob when an exception needs to be propagated
3624 // (see emit_exception_handler in the x86_64.ad file).
3625 //
3626 // Given an exception pc at a call, we call into the runtime for the
3627 // handler in this method. This handler might merely restore state
3628 // (i.e. callee-saved registers), unwind the frame, and jump to the
3629 // exception handler for the nmethod if there is no Java-level handler
3630 // for the nmethod.
3631 //
3632 // This code is entered with a jmp.
3633 //
3634 // Arguments:
3635 //   rax: exception oop
3636 //   rdx: exception pc
3637 //
3638 // Results:
3639 //   rax: exception oop
3640 //   rdx: exception pc in caller or ???
3641 //   destination: exception handler of caller
3642 //
3643 // Note: the exception pc MUST be at a call (precise debug information)
3644 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3645 //
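// Rough flow (informal sketch): stash the exception oop and pc in the
// JavaThread, call OptoRuntime::handle_exception_C() to look up the
// handler, restore rbp, reload the exception oop and pc from the thread,
// and jump to the handler with rax holding the exception oop and rdx the
// exception pc.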
3646 
3647 void OptoRuntime::generate_exception_blob() {
3648   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3649   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3650   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3651 
3652   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3653 
3654   // Allocate space for the code
3655   ResourceMark rm;
3656   // Setup code generation tools
3657   CodeBuffer buffer("exception_blob", 2048, 1024);
3658   MacroAssembler* masm = new MacroAssembler(&buffer);
3659 
3660 
3661   address start = __ pc();
3662 
3663   // Exception pc is 'return address' for stack walker
3664   __ push(rdx);
3665   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3666 
3667   // Save callee-saved registers.  See x86_64.ad.
3668 
3669   // rbp is an implicitly saved callee saved register (i.e., the calling
3670   // convention will save/restore it in the prolog/epilog). Other than that
3671   // there are no callee save registers now that adapter frames are gone.
3672 
3673   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3674 
3675   // Store exception in Thread object. We cannot pass any arguments to the
3676   // handle_exception call, since we do not want to make any assumption
3677   // about the size of the frame where the exception happened in.
3678   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3679   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3680   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3681 
3682   // This call does all the hard work.  It checks if an exception handler
3683   // exists in the method.
3684   // If so, it returns the handler address.
3685   // If not, it prepares for stack-unwinding, restoring the callee-save
3686   // registers of the frame being removed.
3687   //
3688   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3689 
3690   // At a method handle call, the stack may not be properly aligned
3691   // when returning with an exception.
3692   address the_pc = __ pc();
3693   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3694   __ mov(c_rarg0, r15_thread);
3695   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3696   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3697 
3698   // Set an oopmap for the call site.  This oopmap will only be used if we
3699   // are unwinding the stack.  Hence, all locations will be dead.
3700   // Callee-saved registers will be the same as the frame above (i.e.,
3701   // handle_exception_stub), since they were restored when we got the
3702   // exception.
3703 
3704   OopMapSet* oop_maps = new OopMapSet();
3705 
3706   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3707 
3708   __ reset_last_Java_frame(false);
3709 
3710   // Restore callee-saved registers
3711 
3712   // rbp is an implicitly saved callee-saved register (i.e., the calling
3713   // convention will save/restore it in the prolog/epilog). Other than that
3714   // there are no callee-saved registers now that adapter frames are gone.
3715 
3716   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3717 
3718   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3719   __ pop(rdx);                  // No need for exception pc anymore
3720 
3721   // rax: exception handler
3722 
3723   // We have a handler in rax (could be deopt blob).
3724   __ mov(r8, rax);
3725 
3726   // Get the exception oop
3727   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3728   // Get the exception pc in case we are deoptimized
3729   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3730 #ifdef ASSERT
3731   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3732   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3733 #endif
3734   // Clear the exception oop so GC no longer processes it as a root.
3735   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3736 
3737   // rax: exception oop
3738   // r8:  exception handler
3739   // rdx: exception pc
3740   // Jump to handler
3741 
3742   __ jmp(r8);
3743 
3744   // Make sure all code is generated
3745   masm->flush();
3746 
3747   // Set exception blob
3748   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3749 }
3750 #endif // COMPILER2