1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/vframeArray.hpp"
  56 #include "runtime/vm_version.hpp"
  57 #include "utilities/align.hpp"
  58 #include "utilities/checkedCast.hpp"
  59 #include "utilities/formatBuffer.hpp"
  60 #include "vmreg_x86.inline.hpp"
  61 #ifdef COMPILER1
  62 #include "c1/c1_Runtime1.hpp"
  63 #endif
  64 #ifdef COMPILER2
  65 #include "opto/runtime.hpp"
  66 #endif
  67 #if INCLUDE_JVMCI
  68 #include "jvmci/jvmciJavaClasses.hpp"
  69 #endif
  70 
  71 #define __ masm->
  72 
  73 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  74 
  75 class SimpleRuntimeFrame {
  76 
  77   public:
  78 
  79   // Most of the runtime stubs have this simple frame layout.
  80   // This class exists to make the layout shared in one place.
  81   // Offsets are for compiler stack slots, which are jints.
  82   enum layout {
  83     // The frame sender code expects that rbp will be in the "natural" place and
  84     // will override any oopMap setting for it. We must therefore force the layout
  85     // so that it agrees with the frame sender code.
  86     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  87     rbp_off2,
  88     return_off, return_off2,
  89     framesize
  90   };
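  // Illustrative note (assuming frame::arg_reg_save_area_bytes == 0, i.e. the non-Windows
  // case): rbp_off/rbp_off2 are slots 0..1, return_off/return_off2 are slots 2..3, and
  // framesize is 4 slots, i.e. a two-word frame.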
  91 };
  92 
  93 class RegisterSaver {
  94   // Capture info about frame layout.  Layout offsets are in jint
  95   // units because compiler frame slots are jints.
  96 #define XSAVE_AREA_BEGIN 160
  97 #define XSAVE_AREA_YMM_BEGIN 576
  98 #define XSAVE_AREA_OPMASK_BEGIN 1088
  99 #define XSAVE_AREA_ZMM_BEGIN 1152
 100 #define XSAVE_AREA_UPPERBANK 1664
 101 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 102 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 103 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 104 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 105 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
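  // For illustration: DEF_XMM_OFFS(1) expands to
  //   xmm1_off = xmm_off + 1*16/BytesPerInt, xmm1H_off
  // so consecutive XMM registers are 16 bytes (four 4-byte slots) apart in the save area,
  // with the trailing "H" enumerator simply naming the following slot.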
 106   enum layout {
 107     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 108     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 109     DEF_XMM_OFFS(0),
 110     DEF_XMM_OFFS(1),
 111     // 2..15 are implied in range usage
 112     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 113     DEF_YMM_OFFS(0),
 114     DEF_YMM_OFFS(1),
 115     // 2..15 are implied in range usage
 116     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 117     DEF_OPMASK_OFFS(0),
 118     DEF_OPMASK_OFFS(1),
 119     // 2..7 are implied in range usage
 120     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 121     DEF_ZMM_OFFS(0),
 122     DEF_ZMM_OFFS(1),
 123     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 124     DEF_ZMM_UPPER_OFFS(16),
 125     DEF_ZMM_UPPER_OFFS(17),
 126     // 18..31 are implied in range usage
 127     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 128     fpu_stateH_end,
 129     r15_off, r15H_off,
 130     r14_off, r14H_off,
 131     r13_off, r13H_off,
 132     r12_off, r12H_off,
 133     r11_off, r11H_off,
 134     r10_off, r10H_off,
 135     r9_off,  r9H_off,
 136     r8_off,  r8H_off,
 137     rdi_off, rdiH_off,
 138     rsi_off, rsiH_off,
 139     ignore_off, ignoreH_off,  // extra copy of rbp
 140     rsp_off, rspH_off,
 141     rbx_off, rbxH_off,
 142     rdx_off, rdxH_off,
 143     rcx_off, rcxH_off,
 144     rax_off, raxH_off,
 145     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 146     align_off, alignH_off,
 147     flags_off, flagsH_off,
 148     // The frame sender code expects that rbp will be in the "natural" place and
 149     // will override any oopMap setting for it. We must therefore force the layout
 150     // so that it agrees with the frame sender code.
 151     rbp_off, rbpH_off,        // copy of rbp we will restore
 152     return_off, returnH_off,  // slot for return address
 153     reg_save_size             // size in compiler stack slots
 154   };
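  // Reading the enum from low offsets (closest to rsp) to high: any argument register
  // save area, then the fxsave/xsave area holding the XMM/YMM/ZMM/opmask state, then the
  // general-purpose registers, the alignment and flags words, and finally the saved rbp
  // and the return address at the top of the save area.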
 155 
 156  public:
 157   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 158   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 159 
 160   // Offsets into the register save area
 161   // Used by deoptimization when it is managing result register
 162   // values on its own
 163 
 164   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 165   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 166   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 167   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 168   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 169   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 170 
 171   // During deoptimization only the result registers need to be restored,
 172   // all the other values have already been extracted.
 173   static void restore_result_registers(MacroAssembler* masm);
 174 };
 175 
 176 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 177   int off = 0;
 178   int num_xmm_regs = XMMRegister::available_xmm_registers();
 179 #if COMPILER2_OR_JVMCI
 180   if (save_wide_vectors && UseAVX == 0) {
 181     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 182   }
 183   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 184 #else
 185   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 186 #endif
 187 
  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 189   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 190   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 191   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 192   // CodeBlob frame size is in words.
 193   int frame_size_in_words = frame_size_in_bytes / wordSize;
 194   *total_frame_words = frame_size_in_words;
 195 
 196   // Save registers, fpu state, and flags.
 197   // We assume caller has already pushed the return address onto the
 198   // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, just as a normal enter would place it.
 201 
 202   __ enter();          // rsp becomes 16-byte aligned here
 203   __ push_CPU_state(); // Push a multiple of 16 bytes
 204 
  // push_CPU_state handles this on EVEX-enabled targets
 206   if (save_wide_vectors) {
 207     // Save upper half of YMM registers(0..15)
 208     int base_addr = XSAVE_AREA_YMM_BEGIN;
 209     for (int n = 0; n < 16; n++) {
 210       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 211     }
 212     if (VM_Version::supports_evex()) {
 213       // Save upper half of ZMM registers(0..15)
 214       base_addr = XSAVE_AREA_ZMM_BEGIN;
 215       for (int n = 0; n < 16; n++) {
 216         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 217       }
 218       // Save full ZMM registers(16..num_xmm_regs)
 219       base_addr = XSAVE_AREA_UPPERBANK;
 220       off = 0;
 221       int vector_len = Assembler::AVX_512bit;
 222       for (int n = 16; n < num_xmm_regs; n++) {
 223         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 224       }
 225 #if COMPILER2_OR_JVMCI
 226       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 227       off = 0;
 228       for(int n = 0; n < KRegister::number_of_registers; n++) {
 229         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 230       }
 231 #endif
 232     }
 233   } else {
 234     if (VM_Version::supports_evex()) {
 235       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 236       int base_addr = XSAVE_AREA_UPPERBANK;
 237       off = 0;
 238       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 239       for (int n = 16; n < num_xmm_regs; n++) {
 240         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 241       }
 242 #if COMPILER2_OR_JVMCI
 243       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 244       off = 0;
 245       for(int n = 0; n < KRegister::number_of_registers; n++) {
 246         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 247       }
 248 #endif
 249     }
 250   }
 251   __ vzeroupper();
 252   if (frame::arg_reg_save_area_bytes != 0) {
 253     // Allocate argument register save area
 254     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 255   }
 256 
 257   // Set an oopmap for the call site.  This oopmap will map all
 258   // oop-registers and debug-info registers as callee-saved.  This
 259   // will allow deoptimization at this safepoint to find all possible
 260   // debug-info recordings, as well as let GC find all oops.
 261 
 262   OopMapSet *oop_maps = new OopMapSet();
 263   OopMap* map = new OopMap(frame_size_in_slots, 0);
 264 
 265 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 266 
 267   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 271   // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
 273   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX-enabled targets it is included in the xsave area.
 285   off = xmm0_off;
 286   int delta = xmm1_off - off;
 287   for (int n = 0; n < 16; n++) {
 288     XMMRegister xmm_name = as_XMMRegister(n);
 289     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 290     off += delta;
 291   }
 292   if (UseAVX > 2) {
 293     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 294     off = zmm16_off;
 295     delta = zmm17_off - off;
 296     for (int n = 16; n < num_xmm_regs; n++) {
 297       XMMRegister zmm_name = as_XMMRegister(n);
 298       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 299       off += delta;
 300     }
 301   }
 302 
 303 #if COMPILER2_OR_JVMCI
 304   if (save_wide_vectors) {
 305     // Save upper half of YMM registers(0..15)
 306     off = ymm0_off;
 307     delta = ymm1_off - ymm0_off;
 308     for (int n = 0; n < 16; n++) {
 309       XMMRegister ymm_name = as_XMMRegister(n);
 310       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 311       off += delta;
 312     }
 313     if (VM_Version::supports_evex()) {
 314       // Save upper half of ZMM registers(0..15)
 315       off = zmm0_off;
 316       delta = zmm1_off - zmm0_off;
 317       for (int n = 0; n < 16; n++) {
 318         XMMRegister zmm_name = as_XMMRegister(n);
 319         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 320         off += delta;
 321       }
 322     }
 323   }
 324 #endif // COMPILER2_OR_JVMCI
 325 
 326   // %%% These should all be a waste but we'll keep things as they were for now
 327   if (true) {
 328     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 332     // rbp location is known implicitly by the frame sender code, needs no oopmap
 333     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX-enabled targets it is included in the xsave area.
 345     off = xmm0H_off;
 346     delta = xmm1H_off - off;
 347     for (int n = 0; n < 16; n++) {
 348       XMMRegister xmm_name = as_XMMRegister(n);
 349       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 350       off += delta;
 351     }
 352     if (UseAVX > 2) {
 353       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 354       off = zmm16H_off;
 355       delta = zmm17H_off - off;
 356       for (int n = 16; n < num_xmm_regs; n++) {
 357         XMMRegister zmm_name = as_XMMRegister(n);
 358         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 359         off += delta;
 360       }
 361     }
 362   }
 363 
 364   return map;
 365 }
 366 
 367 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 368   int num_xmm_regs = XMMRegister::available_xmm_registers();
 369   if (frame::arg_reg_save_area_bytes != 0) {
 370     // Pop arg register save area
 371     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 372   }
 373 
 374 #if COMPILER2_OR_JVMCI
 375   if (restore_wide_vectors) {
 376     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 377     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 378   }
 379 #else
 380   assert(!restore_wide_vectors, "vectors are generated only by C2");
 381 #endif
 382 
 383   __ vzeroupper();
 384 
 385   // On EVEX enabled targets everything is handled in pop fpu state
 386   if (restore_wide_vectors) {
 387     // Restore upper half of YMM registers (0..15)
 388     int base_addr = XSAVE_AREA_YMM_BEGIN;
 389     for (int n = 0; n < 16; n++) {
 390       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 391     }
 392     if (VM_Version::supports_evex()) {
 393       // Restore upper half of ZMM registers (0..15)
 394       base_addr = XSAVE_AREA_ZMM_BEGIN;
 395       for (int n = 0; n < 16; n++) {
 396         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 397       }
 398       // Restore full ZMM registers(16..num_xmm_regs)
 399       base_addr = XSAVE_AREA_UPPERBANK;
 400       int vector_len = Assembler::AVX_512bit;
 401       int off = 0;
 402       for (int n = 16; n < num_xmm_regs; n++) {
 403         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 404       }
 405 #if COMPILER2_OR_JVMCI
 406       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 407       off = 0;
 408       for (int n = 0; n < KRegister::number_of_registers; n++) {
 409         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 410       }
 411 #endif
 412     }
 413   } else {
 414     if (VM_Version::supports_evex()) {
 415       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 416       int base_addr = XSAVE_AREA_UPPERBANK;
 417       int off = 0;
 418       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 419       for (int n = 16; n < num_xmm_regs; n++) {
 420         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 421       }
 422 #if COMPILER2_OR_JVMCI
 423       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 424       off = 0;
 425       for (int n = 0; n < KRegister::number_of_registers; n++) {
 426         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 427       }
 428 #endif
 429     }
 430   }
 431 
 432   // Recover CPU state
 433   __ pop_CPU_state();
 434   // Get the rbp described implicitly by the calling convention (no oopMap)
 435   __ pop(rbp);
 436 }
 437 
 438 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 439 
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee-save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.
 445 
 446   // Restore fp result register
 447   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 448   // Restore integer result register
 449   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 450   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 451 
  // Pop all of the register save area off the stack except the return address
 453   __ addptr(rsp, return_offset_in_bytes());
 454 }
 455 
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 458 bool SharedRuntime::is_wide_vector(int size) {
 459   return size > 16;
 460 }
 461 
 462 // ---------------------------------------------------------------------------
 463 // Read the array of BasicTypes from a signature, and compute where the
 464 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 465 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 466 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 467 // as framesizes are fixed.
 468 // VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
 472 
 473 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 474 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.
 476 
 477 // The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static JNI methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
// advantage out of it.
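// Illustrative example of the mapping below: for a signature laid out as
// (T_OBJECT, T_LONG, T_VOID, T_INT, T_DOUBLE, T_VOID), the object goes in j_rarg0,
// the long in j_rarg1, the int in j_rarg2 and the double in j_farg0; the T_VOID
// halves are given no location at all (set_bad).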
 482 
 483 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 484                                            VMRegPair *regs,
 485                                            int total_args_passed) {
 486 
 487   // Create the mapping between argument positions and
 488   // registers.
 489   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 490     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 491   };
 492   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 493     j_farg0, j_farg1, j_farg2, j_farg3,
 494     j_farg4, j_farg5, j_farg6, j_farg7
 495   };
 496 
 497 
 498   uint int_args = 0;
 499   uint fp_args = 0;
 500   uint stk_args = 0;
 501 
 502   for (int i = 0; i < total_args_passed; i++) {
 503     switch (sig_bt[i]) {
 504     case T_BOOLEAN:
 505     case T_CHAR:
 506     case T_BYTE:
 507     case T_SHORT:
 508     case T_INT:
 509       if (int_args < Argument::n_int_register_parameters_j) {
 510         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 511       } else {
 512         stk_args = align_up(stk_args, 2);
 513         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 514         stk_args += 1;
 515       }
 516       break;
 517     case T_VOID:
 518       // halves of T_LONG or T_DOUBLE
 519       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 520       regs[i].set_bad();
 521       break;
 522     case T_LONG:
 523       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 524       // fall through
 525     case T_OBJECT:
 526     case T_ARRAY:
 527     case T_ADDRESS:
 528       if (int_args < Argument::n_int_register_parameters_j) {
 529         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 530       } else {
 531         stk_args = align_up(stk_args, 2);
 532         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 533         stk_args += 2;
 534       }
 535       break;
 536     case T_FLOAT:
 537       if (fp_args < Argument::n_float_register_parameters_j) {
 538         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 539       } else {
 540         stk_args = align_up(stk_args, 2);
 541         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 542         stk_args += 1;
 543       }
 544       break;
 545     case T_DOUBLE:
 546       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 547       if (fp_args < Argument::n_float_register_parameters_j) {
 548         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 549       } else {
 550         stk_args = align_up(stk_args, 2);
 551         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 552         stk_args += 2;
 553       }
 554       break;
 555     default:
 556       ShouldNotReachHere();
 557       break;
 558     }
 559   }
 560 
 561   return stk_args;
 562 }
 563 
// Patch the caller's callsite with the entry to compiled code if it exists.
 565 static void patch_callers_callsite(MacroAssembler *masm) {
 566   Label L;
 567   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 568   __ jcc(Assembler::equal, L);
 569 
 570   // Save the current stack pointer
 571   __ mov(r13, rsp);
 572   // Schedule the branch target address early.
 573   // Call into the VM to patch the caller, then jump to compiled callee
 574   // rax isn't live so capture return address while we easily can
 575   __ movptr(rax, Address(rsp, 0));
 576 
 577   // align stack so push_CPU_state doesn't fault
 578   __ andptr(rsp, -(StackAlignmentInBytes));
 579   __ push_CPU_state();
 580   __ vzeroupper();
 581   // VM needs caller's callsite
 582   // VM needs target method
 583   // This needs to be a long call since we will relocate this adapter to
 584   // the codeBuffer and it may not reach
 585 
 586   // Allocate argument register save area
 587   if (frame::arg_reg_save_area_bytes != 0) {
 588     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 589   }
 590   __ mov(c_rarg0, rbx);
 591   __ mov(c_rarg1, rax);
 592   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 593 
 594   // De-allocate argument register save area
 595   if (frame::arg_reg_save_area_bytes != 0) {
 596     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 597   }
 598 
 599   __ vzeroupper();
 600   __ pop_CPU_state();
 601   // restore sp
 602   __ mov(rsp, r13);
 603   __ bind(L);
 604 }
 605 
 606 
 607 static void gen_c2i_adapter(MacroAssembler *masm,
 608                             int total_args_passed,
 609                             int comp_args_on_stack,
 610                             const BasicType *sig_bt,
 611                             const VMRegPair *regs,
 612                             Label& skip_fixup) {
 613   // Before we get into the guts of the C2I adapter, see if we should be here
 614   // at all.  We've come from compiled code and are attempting to jump to the
 615   // interpreter, which means the caller made a static call to get here
 616   // (vcalls always get a compiled target if there is one).  Check for a
 617   // compiled target.  If there is one, we need to patch the caller's call.
 618   patch_callers_callsite(masm);
 619 
 620   __ bind(skip_fixup);
 621 
 622   // Since all args are passed on the stack, total_args_passed *
 623   // Interpreter::stackElementSize is the space we need.
 624 
 625   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 626 
 627   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 628 
 629   // stack is aligned, keep it that way
 630   // This is not currently needed or enforced by the interpreter, but
 631   // we might as well conform to the ABI.
 632   extraspace = align_up(extraspace, 2*wordSize);
 633 
 634   // set senderSP value
 635   __ lea(r13, Address(rsp, wordSize));
 636 
 637 #ifdef ASSERT
 638   __ check_stack_alignment(r13, "sender stack not aligned");
 639 #endif
 640   if (extraspace > 0) {
 641     // Pop the return address
 642     __ pop(rax);
 643 
 644     __ subptr(rsp, extraspace);
 645 
 646     // Push the return address
 647     __ push(rax);
 648 
 649     // Account for the return address location since we store it first rather
 650     // than hold it in a register across all the shuffling
 651     extraspace += wordSize;
 652   }
 653 
 654 #ifdef ASSERT
 655   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 656 #endif
 657 
 658   // Now write the args into the outgoing interpreter space
 659   for (int i = 0; i < total_args_passed; i++) {
 660     if (sig_bt[i] == T_VOID) {
 661       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 662       continue;
 663     }
 664 
 665     // offset to start parameters
 666     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 667     int next_off = st_off - Interpreter::stackElementSize;
 668 
 669     // Say 4 args:
 670     // i   st_off
 671     // 0   32 T_LONG
 672     // 1   24 T_VOID
 673     // 2   16 T_OBJECT
 674     // 3    8 T_BOOL
 675     // -    0 return address
 676     //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
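    // In the example above, the T_LONG at i == 0 is therefore stored at next_off (24),
    // the slot nominally belonging to its T_VOID half at i == 1, while the slot at
    // st_off (32) is left unused (and filled with junk in debug builds).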
 681 
 682     VMReg r_1 = regs[i].first();
 683     VMReg r_2 = regs[i].second();
 684     if (!r_1->is_valid()) {
 685       assert(!r_2->is_valid(), "");
 686       continue;
 687     }
 688     if (r_1->is_stack()) {
 689       // memory to memory use rax
 690       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 691       if (!r_2->is_valid()) {
 692         // sign extend??
 693         __ movl(rax, Address(rsp, ld_off));
 694         __ movptr(Address(rsp, st_off), rax);
 695 
 696       } else {
 697 
 698         __ movq(rax, Address(rsp, ld_off));
 699 
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 701         // T_DOUBLE and T_LONG use two slots in the interpreter
 702         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 703           // ld_off == LSW, ld_off+wordSize == MSW
 704           // st_off == MSW, next_off == LSW
 705           __ movq(Address(rsp, next_off), rax);
 706 #ifdef ASSERT
 707           // Overwrite the unused slot with known junk
 708           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 709           __ movptr(Address(rsp, st_off), rax);
 710 #endif /* ASSERT */
 711         } else {
 712           __ movq(Address(rsp, st_off), rax);
 713         }
 714       }
 715     } else if (r_1->is_Register()) {
 716       Register r = r_1->as_Register();
 717       if (!r_2->is_valid()) {
        // must be only an int (or smaller) so move only 32 bits to the slot
 719         // why not sign extend??
 720         __ movl(Address(rsp, st_off), r);
 721       } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 723         // T_DOUBLE and T_LONG use two slots in the interpreter
 724         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 725           // long/double in gpr
 726 #ifdef ASSERT
 727           // Overwrite the unused slot with known junk
 728           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 729           __ movptr(Address(rsp, st_off), rax);
 730 #endif /* ASSERT */
 731           __ movq(Address(rsp, next_off), r);
 732         } else {
 733           __ movptr(Address(rsp, st_off), r);
 734         }
 735       }
 736     } else {
 737       assert(r_1->is_XMMRegister(), "");
 738       if (!r_2->is_valid()) {
        // only a float; use just part of the slot
 740         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 741       } else {
 742 #ifdef ASSERT
 743         // Overwrite the unused slot with known junk
 744         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 745         __ movptr(Address(rsp, st_off), rax);
 746 #endif /* ASSERT */
 747         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 748       }
 749     }
 750   }
 751 
 752   // Schedule the branch target address early.
 753   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 754   __ jmp(rcx);
 755 }
 756 
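// Emits a range check on pc_reg: control transfers to L_ok when
// code_start < pc_reg < code_end; otherwise it falls through (past the local L_fail
// label) so the caller can emit its own failure handling.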
 757 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 758                         address code_start, address code_end,
 759                         Label& L_ok) {
 760   Label L_fail;
 761   __ lea(temp_reg, ExternalAddress(code_start));
 762   __ cmpptr(pc_reg, temp_reg);
 763   __ jcc(Assembler::belowEqual, L_fail);
 764   __ lea(temp_reg, ExternalAddress(code_end));
 765   __ cmpptr(pc_reg, temp_reg);
 766   __ jcc(Assembler::below, L_ok);
 767   __ bind(L_fail);
 768 }
 769 
 770 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 771                                     int total_args_passed,
 772                                     int comp_args_on_stack,
 773                                     const BasicType *sig_bt,
 774                                     const VMRegPair *regs) {
 775 
  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args, as
  // we must align the stack to 16 bytes on an i2c entry or else we
  // lose the alignment we expect in all compiled code, and register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
 784 
 785   // Adapters can be frameless because they do not require the caller
 786   // to perform additional cleanup work, such as correcting the stack pointer.
 787   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 788   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 789   // even if a callee has modified the stack pointer.
 790   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 791   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 792   // up via the senderSP register).
 793   // In other words, if *either* the caller or callee is interpreted, we can
 794   // get the stack pointer repaired after a call.
 795   // This is why c2i and i2c adapters cannot be indefinitely composed.
 796   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 797   // both caller and callee would be compiled methods, and neither would
 798   // clean up the stack pointer changes performed by the two adapters.
 799   // If this happens, control eventually transfers back to the compiled
 800   // caller, but with an uncorrected stack, causing delayed havoc.
 801 
 802   if (VerifyAdapterCalls &&
 803       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 804     // So, let's test for cascading c2i/i2c adapters right now.
 805     //  assert(Interpreter::contains($return_addr) ||
 806     //         StubRoutines::contains($return_addr),
 807     //         "i2c adapter must return to an interpreter frame");
 808     __ block_comment("verify_i2c { ");
 809     // Pick up the return address
 810     __ movptr(rax, Address(rsp, 0));
 811     Label L_ok;
 812     if (Interpreter::code() != nullptr) {
 813       range_check(masm, rax, r11,
 814                   Interpreter::code()->code_start(),
 815                   Interpreter::code()->code_end(),
 816                   L_ok);
 817     }
 818     if (StubRoutines::initial_stubs_code() != nullptr) {
 819       range_check(masm, rax, r11,
 820                   StubRoutines::initial_stubs_code()->code_begin(),
 821                   StubRoutines::initial_stubs_code()->code_end(),
 822                   L_ok);
 823     }
 824     if (StubRoutines::final_stubs_code() != nullptr) {
 825       range_check(masm, rax, r11,
 826                   StubRoutines::final_stubs_code()->code_begin(),
 827                   StubRoutines::final_stubs_code()->code_end(),
 828                   L_ok);
 829     }
 830     const char* msg = "i2c adapter must return to an interpreter frame";
 831     __ block_comment(msg);
 832     __ stop(msg);
 833     __ bind(L_ok);
    __ block_comment("} verify_i2c ");
 835   }
 836 
 837   // Must preserve original SP for loading incoming arguments because
 838   // we need to align the outgoing SP for compiled code.
 839   __ movptr(r11, rsp);
 840 
 841   // Pick up the return address
 842   __ pop(rax);
 843 
 844   // Convert 4-byte c2 stack slots to words.
 845   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 846 
 847   if (comp_args_on_stack) {
 848     __ subptr(rsp, comp_words_on_stack * wordSize);
 849   }
 850 
 851   // Ensure compiled code always sees stack at proper alignment
 852   __ andptr(rsp, -16);
 853 
  // Push the return address, misaligning the stack so that the youngest frame
  // sees rsp exactly as it would immediately after a call instruction.
 856   __ push(rax);
 857 
 858   // Put saved SP in another register
 859   const Register saved_sp = rax;
 860   __ movptr(saved_sp, r11);
 861 
 862   // Will jump to the compiled code just as if compiled code was doing it.
 863   // Pre-load the register-jump target early, to schedule it better.
 864   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 865 
 866 #if INCLUDE_JVMCI
 867   if (EnableJVMCI) {
 868     // check if this call should be routed towards a specific entry point
 869     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 870     Label no_alternative_target;
 871     __ jcc(Assembler::equal, no_alternative_target);
 872     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 873     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 874     __ bind(no_alternative_target);
 875   }
 876 #endif // INCLUDE_JVMCI
 877 
 878   // Now generate the shuffle code.  Pick up all register args and move the
 879   // rest through the floating point stack top.
 880   for (int i = 0; i < total_args_passed; i++) {
 881     if (sig_bt[i] == T_VOID) {
 882       // Longs and doubles are passed in native word order, but misaligned
 883       // in the 32-bit build.
 884       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 885       continue;
 886     }
 887 
 888     // Pick up 0, 1 or 2 words from SP+offset.
 889 
 890     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 891             "scrambled load targets?");
 892     // Load in argument order going down.
 893     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 894     // Point to interpreter value (vs. tag)
 895     int next_off = ld_off - Interpreter::stackElementSize;
 896     //
 897     //
 898     //
 899     VMReg r_1 = regs[i].first();
 900     VMReg r_2 = regs[i].second();
 901     if (!r_1->is_valid()) {
 902       assert(!r_2->is_valid(), "");
 903       continue;
 904     }
 905     if (r_1->is_stack()) {
 906       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 907       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 908 
      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going through a c2i because of a miss, a reasonable value of r13
      // will be generated.
 912       if (!r_2->is_valid()) {
 913         // sign extend???
 914         __ movl(r13, Address(saved_sp, ld_off));
 915         __ movptr(Address(rsp, st_off), r13);
 916       } else {
 917         //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed at negative offsets, so the LSW is at the LOW address.
 924 
 925         // ld_off is MSW so get LSW
 926         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 927                            next_off : ld_off;
 928         __ movq(r13, Address(saved_sp, offset));
 929         // st_off is LSW (i.e. reg.first())
 930         __ movq(Address(rsp, st_off), r13);
 931       }
 932     } else if (r_1->is_Register()) {  // Register argument
 933       Register r = r_1->as_Register();
 934       assert(r != rax, "must be different");
 935       if (r_2->is_valid()) {
 936         //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
 940 
 941         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 942                            next_off : ld_off;
 943 
 944         // this can be a misaligned move
 945         __ movq(r, Address(saved_sp, offset));
 946       } else {
 947         // sign extend and use a full word?
 948         __ movl(r, Address(saved_sp, ld_off));
 949       }
 950     } else {
 951       if (!r_2->is_valid()) {
 952         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 953       } else {
 954         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 955       }
 956     }
 957   }
 958 
 959   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 960 
 961   // 6243940 We might end up in handle_wrong_method if
 962   // the callee is deoptimized as we race thru here. If that
 963   // happens we don't want to take a safepoint because the
 964   // caller frame will look interpreted and arguments are now
 965   // "compiled" so it is much better to make this transition
 966   // invisible to the stack walking code. Unfortunately if
 967   // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
 970 
 971   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 972 
  // Put the Method* where a c2i would expect it, should we end up there.
  // Only needed because c2 resolve stubs return Method* as a result in
  // rax.
 976   __ mov(rax, rbx);
 977   __ jmp(r11);
 978 }
 979 
 980 // ---------------------------------------------------------------
 981 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 982                                                             int total_args_passed,
 983                                                             int comp_args_on_stack,
 984                                                             const BasicType *sig_bt,
 985                                                             const VMRegPair *regs,
 986                                                             AdapterFingerPrint* fingerprint) {
 987   address i2c_entry = __ pc();
 988 
 989   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 990 
 991   // -------------------------------------------------------------------------
 992   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 993   // to the interpreter.  The args start out packed in the compiled layout.  They
 994   // need to be unpacked into the interpreter layout.  This will almost always
 995   // require some stack space.  We grow the current (compiled) stack, then repack
 996   // the args.  We  finally end in a jump to the generic interpreter entry point.
 997   // On exit from the interpreter, the interpreter will restore our SP (lest the
 998   // compiled code, which relies solely on SP and not RBP, get sick).
 999 
1000   address c2i_unverified_entry = __ pc();
1001   Label skip_fixup;
1002 
1003   Register data = rax;
1004   Register receiver = j_rarg0;
1005   Register temp = rbx;
1006 
1007   {
1008     __ ic_check(1 /* end_alignment */);
1009     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
    // the call site corrected.
1013     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1014     __ jcc(Assembler::equal, skip_fixup);
1015     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1016   }
1017 
1018   address c2i_entry = __ pc();
1019 
1020   // Class initialization barrier for static methods
1021   address c2i_no_clinit_check_entry = nullptr;
1022   if (VM_Version::supports_fast_class_init_checks()) {
1023     Label L_skip_barrier;
1024     Register method = rbx;
1025 
1026     { // Bypass the barrier for non-static methods
1027       Register flags = rscratch1;
1028       __ movl(flags, Address(method, Method::access_flags_offset()));
1029       __ testl(flags, JVM_ACC_STATIC);
1030       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1031     }
1032 
1033     Register klass = rscratch1;
1034     __ load_method_holder(klass, method);
1035     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1036 
1037     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1038 
1039     __ bind(L_skip_barrier);
1040     c2i_no_clinit_check_entry = __ pc();
1041   }
1042 
1043   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1044   bs->c2i_entry_barrier(masm);
1045 
1046   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1047 
1048   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1049 }
1050 
1051 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1052                                          VMRegPair *regs,
1053                                          int total_args_passed) {
1054 
1055 // We return the amount of VMRegImpl stack slots we need to reserve for all
1056 // the arguments NOT counting out_preserve_stack_slots.
1057 
1058 // NOTE: These arrays will have to change when c1 is ported
1059 #ifdef _WIN64
1060     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1061       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1062     };
1063     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1064       c_farg0, c_farg1, c_farg2, c_farg3
1065     };
1066 #else
1067     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1068       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1069     };
1070     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1071       c_farg0, c_farg1, c_farg2, c_farg3,
1072       c_farg4, c_farg5, c_farg6, c_farg7
1073     };
1074 #endif // _WIN64
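    // Illustrative example: for a native signature of (jint, jlong, jdouble) the loop
    // below assigns c_rarg0, c_rarg1 and c_farg0 on Linux (integer and FP registers are
    // counted independently), but c_rarg0, c_rarg1 and c_farg2 on Windows, where integer
    // and FP arguments share positional slots and a 32-byte shadow area is always
    // reserved on the stack.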
1075 
1076 
1077     uint int_args = 0;
1078     uint fp_args = 0;
1079     uint stk_args = 0; // inc by 2 each time
1080 
1081     for (int i = 0; i < total_args_passed; i++) {
1082       switch (sig_bt[i]) {
1083       case T_BOOLEAN:
1084       case T_CHAR:
1085       case T_BYTE:
1086       case T_SHORT:
1087       case T_INT:
1088         if (int_args < Argument::n_int_register_parameters_c) {
1089           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1090 #ifdef _WIN64
1091           fp_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1093           stk_args += 2;
1094 #endif
1095         } else {
1096           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1097           stk_args += 2;
1098         }
1099         break;
1100       case T_LONG:
1101         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1102         // fall through
1103       case T_OBJECT:
1104       case T_ARRAY:
1105       case T_ADDRESS:
1106       case T_METADATA:
1107         if (int_args < Argument::n_int_register_parameters_c) {
1108           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1109 #ifdef _WIN64
1110           fp_args++;
1111           stk_args += 2;
1112 #endif
1113         } else {
1114           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1115           stk_args += 2;
1116         }
1117         break;
1118       case T_FLOAT:
1119         if (fp_args < Argument::n_float_register_parameters_c) {
1120           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1121 #ifdef _WIN64
1122           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1124           stk_args += 2;
1125 #endif
1126         } else {
1127           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1128           stk_args += 2;
1129         }
1130         break;
1131       case T_DOUBLE:
1132         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1133         if (fp_args < Argument::n_float_register_parameters_c) {
1134           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1135 #ifdef _WIN64
1136           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1138           stk_args += 2;
1139 #endif
1140         } else {
1141           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1142           stk_args += 2;
1143         }
1144         break;
1145       case T_VOID: // Halves of longs and doubles
1146         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1147         regs[i].set_bad();
1148         break;
1149       default:
1150         ShouldNotReachHere();
1151         break;
1152       }
1153     }
1154 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
1157   if (stk_args < 8) {
1158     stk_args = 8;
1159   }
1160 #endif // _WIN64
1161 
1162   return stk_args;
1163 }
1164 
1165 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1166                                              uint num_bits,
1167                                              uint total_args_passed) {
1168   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1169          "only certain vector sizes are supported for now");
1170 
1171   static const XMMRegister VEC_ArgReg[32] = {
1172      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1173      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1174     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1175     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1176   };
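  // Each vector argument is passed in its own XMM/ZMM register; next_val below is
  // num_bits/32 - 1, so the VMRegPair spans every 32-bit slot of the register
  // (e.g. 16 slots for a 512-bit vector). No stack slots are used.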
1177 
1178   uint stk_args = 0;
1179   uint fp_args = 0;
1180 
1181   for (uint i = 0; i < total_args_passed; i++) {
1182     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1183     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1184     regs[i].set_pair(vmreg->next(next_val), vmreg);
1185   }
1186 
1187   return stk_args;
1188 }
1189 
1190 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame
  // pointer, which by this time is free to use.
1193   switch (ret_type) {
1194   case T_FLOAT:
1195     __ movflt(Address(rbp, -wordSize), xmm0);
1196     break;
1197   case T_DOUBLE:
1198     __ movdbl(Address(rbp, -wordSize), xmm0);
1199     break;
1200   case T_VOID:  break;
1201   default: {
1202     __ movptr(Address(rbp, -wordSize), rax);
1203     }
1204   }
1205 }
1206 
1207 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame
  // pointer, which by this time is free to use.
1210   switch (ret_type) {
1211   case T_FLOAT:
1212     __ movflt(xmm0, Address(rbp, -wordSize));
1213     break;
1214   case T_DOUBLE:
1215     __ movdbl(xmm0, Address(rbp, -wordSize));
1216     break;
1217   case T_VOID:  break;
1218   default: {
1219     __ movptr(rax, Address(rbp, -wordSize));
1220     }
1221   }
1222 }
1223 
1224 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1225     for ( int i = first_arg ; i < arg_count ; i++ ) {
1226       if (args[i].first()->is_Register()) {
1227         __ push(args[i].first()->as_Register());
1228       } else if (args[i].first()->is_XMMRegister()) {
1229         __ subptr(rsp, 2*wordSize);
1230         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1231       }
1232     }
1233 }
1234 
1235 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1236     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1237       if (args[i].first()->is_Register()) {
1238         __ pop(args[i].first()->as_Register());
1239       } else if (args[i].first()->is_XMMRegister()) {
1240         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1241         __ addptr(rsp, 2*wordSize);
1242       }
1243     }
1244 }
1245 
1246 static void verify_oop_args(MacroAssembler* masm,
1247                             const methodHandle& method,
1248                             const BasicType* sig_bt,
1249                             const VMRegPair* regs) {
1250   Register temp_reg = rbx;  // not part of any compiled calling seq
1251   if (VerifyOops) {
1252     for (int i = 0; i < method->size_of_parameters(); i++) {
1253       if (is_reference_type(sig_bt[i])) {
1254         VMReg r = regs[i].first();
1255         assert(r->is_valid(), "bad oop arg");
1256         if (r->is_stack()) {
1257           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1258           __ verify_oop(temp_reg);
1259         } else {
1260           __ verify_oop(r->as_Register());
1261         }
1262       }
1263     }
1264   }
1265 }
1266 
1267 static void check_continuation_enter_argument(VMReg actual_vmreg,
1268                                               Register expected_reg,
1269                                               const char* name) {
1270   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1271   assert(actual_vmreg->as_Register() == expected_reg,
1272          "%s is in unexpected register: %s instead of %s",
1273          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1274 }
1275 
1276 
1277 //---------------------------- continuation_enter_setup ---------------------------
1278 //
1279 // Arguments:
1280 //   None.
1281 //
1282 // Results:
1283 //   rsp: pointer to blank ContinuationEntry
1284 //
1285 // Kills:
1286 //   rax
1287 //
1288 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1289   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1290   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1291   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1292 
1293   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1294   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1295 
1296   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1297   OopMap* map = new OopMap(frame_size, 0);
1298 
1299   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1300   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1301   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1302 
1303   return map;
1304 }
1305 
1306 //---------------------------- fill_continuation_entry ---------------------------
1307 //
1308 // Arguments:
1309 //   rsp: pointer to blank ContinuationEntry
1310 //   reg_cont_obj: pointer to the continuation
1311 //   reg_flags: flags
1312 //
1313 // Results:
1314 //   rsp: pointer to filled out ContinuationEntry
1315 //
1316 // Kills:
1317 //   rax
1318 //
1319 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1320   assert_different_registers(rax, reg_cont_obj, reg_flags);
1321 #ifdef ASSERT
1322   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1323 #endif
1324   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1325   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1326   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1327   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1328   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1329 
1330   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1331   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1332   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1333   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1334 
1335   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1336   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1337 }
1338 
1339 //---------------------------- continuation_enter_cleanup ---------------------------
1340 //
1341 // Arguments:
1342 //   rsp: pointer to the ContinuationEntry
1343 //
1344 // Results:
1345 //   rsp: pointer to the spilled rbp in the entry frame
1346 //
1347 // Kills:
1348 //   rbx
1349 //
1350 static void continuation_enter_cleanup(MacroAssembler* masm) {
1351 #ifdef ASSERT
1352   Label L_good_sp;
1353   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1354   __ jcc(Assembler::equal, L_good_sp);
1355   __ stop("Incorrect rsp at continuation_enter_cleanup");
1356   __ bind(L_good_sp);
1357 #endif
1358 
1359   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1360   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1361   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1362   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1363 
1364   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1365   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1366   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1367 }
1368 
1369 static void gen_continuation_enter(MacroAssembler* masm,
1370                                    const VMRegPair* regs,
1371                                    int& exception_offset,
1372                                    OopMapSet* oop_maps,
1373                                    int& frame_complete,
1374                                    int& stack_slots,
1375                                    int& interpreted_entry_offset,
1376                                    int& compiled_entry_offset) {
1377 
1378   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1379   int pos_cont_obj   = 0;
1380   int pos_is_cont    = 1;
1381   int pos_is_virtual = 2;
1382 
1383   // The platform-specific calling convention may present the arguments in various registers.
1384   // To simplify the rest of the code, we expect the arguments to reside in these known
1385   // registers, and we additionally check the placement here in case the calling convention
1386   // ever changes.
1387   Register reg_cont_obj   = c_rarg1;
1388   Register reg_is_cont    = c_rarg2;
1389   Register reg_is_virtual = c_rarg3;
1390 
1391   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1392   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1393   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1394 
1395   // Utility methods kill rax, make sure there are no collisions
1396   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1397 
1398   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1399                          relocInfo::static_call_type);
1400 
1401   address start = __ pc();
1402 
1403   Label L_thaw, L_exit;
1404 
1405   // i2i entry, used only in interp_only_mode
1406   interpreted_entry_offset = __ pc() - start;
1407   {
1408 #ifdef ASSERT
1409     Label is_interp_only;
1410     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1411     __ jcc(Assembler::notEqual, is_interp_only);
1412     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1413     __ bind(is_interp_only);
1414 #endif
1415 
1416     __ pop(rax); // return address
1417     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
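         // The interpreter passes its arguments on the expression stack, last argument on top:
         // element 0 is isVirtualThread, element 1 is isContinue, element 2 is the Continuation object.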
1418     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1419     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1420     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1421     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1422     __ push(rax); // return address
1423     __ push_cont_fastpath();
1424 
1425     __ enter();
1426 
1427     stack_slots = 2; // will be adjusted in setup
1428     OopMap* map = continuation_enter_setup(masm, stack_slots);
1429     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1430     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1431 
1432     __ verify_oop(reg_cont_obj);
1433 
1434     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1435 
1436     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1437     __ testptr(reg_is_cont, reg_is_cont);
1438     __ jcc(Assembler::notZero, L_thaw);
1439 
1440     // --- Resolve path
1441 
1442     // Make sure the call is patchable
1443     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1444     // Emit stub for static call
1445     CodeBuffer* cbuf = masm->code_section()->outer();
1446     address stub = CompiledDirectCall::emit_to_interp_stub(*cbuf, __ pc());
1447     if (stub == nullptr) {
1448       fatal("CodeCache is full at gen_continuation_enter");
1449     }
1450     __ call(resolve);
1451     oop_maps->add_gc_map(__ pc() - start, map);
1452     __ post_call_nop();
1453 
1454     __ jmp(L_exit);
1455   }
1456 
1457   // compiled entry
1458   __ align(CodeEntryAlignment);
1459   compiled_entry_offset = __ pc() - start;
1460   __ enter();
1461 
1462   stack_slots = 2; // will be adjusted in setup
1463   OopMap* map = continuation_enter_setup(masm, stack_slots);
1464 
1465   // Frame is now completed as far as size and linkage.
1466   frame_complete = __ pc() - start;
1467 
1468   __ verify_oop(reg_cont_obj);
1469 
1470   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1471 
1472   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1473   __ testptr(reg_is_cont, reg_is_cont);
1474   __ jccb(Assembler::notZero, L_thaw);
1475 
1476   // --- call Continuation.enter(Continuation c, boolean isContinue)
1477 
1478   // Make sure the call is patchable
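       // (align so that the 4-byte call displacement starts on a word boundary and can be
       // patched atomically when the call is resolved)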
1479   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1480 
1481   // Emit stub for static call
1482   CodeBuffer* cbuf = masm->code_section()->outer();
1483   address stub = CompiledDirectCall::emit_to_interp_stub(*cbuf, __ pc());
1484   if (stub == nullptr) {
1485     fatal("CodeCache is full at gen_continuation_enter");
1486   }
1487 
1488   // The call needs to be resolved. There's a special case for this in
1489   // SharedRuntime::find_callee_info_helper() which calls
1490   // LinkResolver::resolve_continuation_enter() which resolves the call to
1491   // Continuation.enter(Continuation c, boolean isContinue).
1492   __ call(resolve);
1493 
1494   oop_maps->add_gc_map(__ pc() - start, map);
1495   __ post_call_nop();
1496 
1497   __ jmpb(L_exit);
1498 
1499   // --- Thawing path
1500 
1501   __ bind(L_thaw);
1502 
1503   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1504   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1505 
1506   ContinuationEntry::_return_pc_offset = __ pc() - start;
1507   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1508   __ post_call_nop();
1509 
1510   // --- Normal exit (resolve/thawing)
1511 
1512   __ bind(L_exit);
1513 
1514   continuation_enter_cleanup(masm);
1515   __ pop(rbp);
1516   __ ret(0);
1517 
1518   // --- Exception handling path
1519 
1520   exception_offset = __ pc() - start;
1521 
1522   continuation_enter_cleanup(masm);
1523   __ pop(rbp);
1524 
1525   __ movptr(c_rarg0, r15_thread);
1526   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1527 
1528   // rax still holds the original exception oop, save it before the call
1529   __ push(rax);
1530 
1531   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1532   __ movptr(rbx, rax);
1533 
1534   // Continue at exception handler:
1535   //   rax: exception oop
1536   //   rbx: exception handler
1537   //   rdx: exception pc
1538   __ pop(rax);
1539   __ verify_oop(rax);
1540   __ pop(rdx);
1541   __ jmp(rbx);
1542 }
1543 
1544 static void gen_continuation_yield(MacroAssembler* masm,
1545                                    const VMRegPair* regs,
1546                                    OopMapSet* oop_maps,
1547                                    int& frame_complete,
1548                                    int& stack_slots,
1549                                    int& compiled_entry_offset) {
1550   enum layout {
1551     rbp_off,
1552     rbpH_off,
1553     return_off,
1554     return_off2,
1555     framesize // inclusive of return address
1556   };
1557   stack_slots = framesize / VMRegImpl::slots_per_word;
1558   assert(stack_slots == 2, "recheck layout");
1559 
1560   address start = __ pc();
1561   compiled_entry_offset = __ pc() - start;
1562   __ enter();
1563   address the_pc = __ pc();
1564 
1565   frame_complete = the_pc - start;
1566 
1567   // This nop must be exactly at the PC we push into the frame info.
1568   // We use this nop for fast CodeBlob lookup, so associate the OopMap
1569   // with it right away.
1570   __ post_call_nop();
1571   OopMap* map = new OopMap(framesize, 1);
1572   oop_maps->add_gc_map(frame_complete, map);
1573 
1574   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1575   __ movptr(c_rarg0, r15_thread);
1576   __ movptr(c_rarg1, rsp);
1577   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1578   __ reset_last_Java_frame(true);
1579 
1580   Label L_pinned;
1581 
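       // A zero result from the freeze call means success; a non-zero result means the freeze
       // failed (e.g. the continuation is pinned).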
1582   __ testptr(rax, rax);
1583   __ jcc(Assembler::notZero, L_pinned);
1584 
1585   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1586   continuation_enter_cleanup(masm);
1587   __ pop(rbp);
1588   __ ret(0);
1589 
1590   __ bind(L_pinned);
1591 
1592   // Pinned, return to caller
1593 
1594   // handle pending exception thrown by freeze
1595   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1596   Label ok;
1597   __ jcc(Assembler::equal, ok);
1598   __ leave();
1599   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1600   __ bind(ok);
1601 
1602   __ leave();
1603   __ ret(0);
1604 }
1605 
1606 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1607   ::continuation_enter_cleanup(masm);
1608 }
1609 
1610 static void gen_special_dispatch(MacroAssembler* masm,
1611                                  const methodHandle& method,
1612                                  const BasicType* sig_bt,
1613                                  const VMRegPair* regs) {
1614   verify_oop_args(masm, method, sig_bt, regs);
1615   vmIntrinsics::ID iid = method->intrinsic_id();
1616 
1617   // Now write the args into the outgoing interpreter space
1618   bool     has_receiver   = false;
1619   Register receiver_reg   = noreg;
1620   int      member_arg_pos = -1;
1621   Register member_reg     = noreg;
1622   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1623   if (ref_kind != 0) {
1624     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1625     member_reg = rbx;  // known to be free at this point
1626     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1627   } else if (iid == vmIntrinsics::_invokeBasic) {
1628     has_receiver = true;
1629   } else if (iid == vmIntrinsics::_linkToNative) {
1630     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1631     member_reg = rbx;  // known to be free at this point
1632   } else {
1633     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1634   }
1635 
1636   if (member_reg != noreg) {
1637     // Load the member_arg into register, if necessary.
1638     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1639     VMReg r = regs[member_arg_pos].first();
1640     if (r->is_stack()) {
1641       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1642     } else {
1643       // no data motion is needed
1644       member_reg = r->as_Register();
1645     }
1646   }
1647 
1648   if (has_receiver) {
1649     // Make sure the receiver is loaded into a register.
1650     assert(method->size_of_parameters() > 0, "oob");
1651     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1652     VMReg r = regs[0].first();
1653     assert(r->is_valid(), "bad receiver arg");
1654     if (r->is_stack()) {
1655       // Porting note:  This assumes that compiled calling conventions always
1656       // pass the receiver oop in a register.  If this is not true on some
1657       // platform, pick a temp and load the receiver from stack.
1658       fatal("receiver always in a register");
1659       receiver_reg = j_rarg0;  // known to be free at this point
1660       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1661     } else {
1662       // no data motion is needed
1663       receiver_reg = r->as_Register();
1664     }
1665   }
1666 
1667   // Figure out which address we are really jumping to:
1668   MethodHandles::generate_method_handle_dispatch(masm, iid,
1669                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1670 }
1671 
1672 // ---------------------------------------------------------------------------
1673 // Generate a native wrapper for a given method.  The method takes arguments
1674 // in the Java compiled code convention, marshals them to the native
1675 // convention (handlizes oops, etc), transitions to native, makes the call,
1676 // returns to java state (possibly blocking), unhandlizes any result and
1677 // returns.
1678 //
1679 // Critical native functions are a shorthand for the use of
1680 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1681 // functions.  The wrapper is expected to unpack the arguments before
1682 // passing them to the callee. Critical native functions leave the state _in_Java,
1683 // since they cannot stop for GC.
1684 // Some other parts of JNI setup are skipped as well, like the tear-down of the JNI handle
1685 // block and the check for pending exceptions, since it's impossible for them
1686 // to be thrown.
1687 //
1688 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1689                                                 const methodHandle& method,
1690                                                 int compile_id,
1691                                                 BasicType* in_sig_bt,
1692                                                 VMRegPair* in_regs,
1693                                                 BasicType ret_type) {
1694   if (method->is_continuation_native_intrinsic()) {
1695     int exception_offset = -1;
1696     OopMapSet* oop_maps = new OopMapSet();
1697     int frame_complete = -1;
1698     int stack_slots = -1;
1699     int interpreted_entry_offset = -1;
1700     int vep_offset = -1;
1701     if (method->is_continuation_enter_intrinsic()) {
1702       gen_continuation_enter(masm,
1703                              in_regs,
1704                              exception_offset,
1705                              oop_maps,
1706                              frame_complete,
1707                              stack_slots,
1708                              interpreted_entry_offset,
1709                              vep_offset);
1710     } else if (method->is_continuation_yield_intrinsic()) {
1711       gen_continuation_yield(masm,
1712                              in_regs,
1713                              oop_maps,
1714                              frame_complete,
1715                              stack_slots,
1716                              vep_offset);
1717     } else {
1718       guarantee(false, "Unknown Continuation native intrinsic");
1719     }
1720 
1721 #ifdef ASSERT
1722     if (method->is_continuation_enter_intrinsic()) {
1723       assert(interpreted_entry_offset != -1, "Must be set");
1724       assert(exception_offset != -1,         "Must be set");
1725     } else {
1726       assert(interpreted_entry_offset == -1, "Must be unset");
1727       assert(exception_offset == -1,         "Must be unset");
1728     }
1729     assert(frame_complete != -1,    "Must be set");
1730     assert(stack_slots != -1,       "Must be set");
1731     assert(vep_offset != -1,        "Must be set");
1732 #endif
1733 
1734     __ flush();
1735     nmethod* nm = nmethod::new_native_nmethod(method,
1736                                               compile_id,
1737                                               masm->code(),
1738                                               vep_offset,
1739                                               frame_complete,
1740                                               stack_slots,
1741                                               in_ByteSize(-1),
1742                                               in_ByteSize(-1),
1743                                               oop_maps,
1744                                               exception_offset);
1745     if (nm == nullptr) return nm;
1746     if (method->is_continuation_enter_intrinsic()) {
1747       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1748     } else if (method->is_continuation_yield_intrinsic()) {
1749       _cont_doYield_stub = nm;
1750     }
1751     return nm;
1752   }
1753 
1754   if (method->is_method_handle_intrinsic()) {
1755     vmIntrinsics::ID iid = method->intrinsic_id();
1756     intptr_t start = (intptr_t)__ pc();
1757     int vep_offset = ((intptr_t)__ pc()) - start;
1758     gen_special_dispatch(masm,
1759                          method,
1760                          in_sig_bt,
1761                          in_regs);
1762     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1763     __ flush();
1764     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1765     return nmethod::new_native_nmethod(method,
1766                                        compile_id,
1767                                        masm->code(),
1768                                        vep_offset,
1769                                        frame_complete,
1770                                        stack_slots / VMRegImpl::slots_per_word,
1771                                        in_ByteSize(-1),
1772                                        in_ByteSize(-1),
1773                                        nullptr);
1774   }
1775   address native_func = method->native_function();
1776   assert(native_func != nullptr, "must have function");
1777 
1778   // An OopMap for lock (and class if static)
1779   OopMapSet *oop_maps = new OopMapSet();
1780   intptr_t start = (intptr_t)__ pc();
1781 
1782   // We have received a description of where all the Java args are located
1783   // on entry to the wrapper. We need to convert these args to where
1784   // the JNI function will expect them. To figure out where they go
1785   // we convert the Java signature to a C signature by inserting
1786   // the hidden arguments as arg[0] and possibly arg[1] (static method).
1787 
1788   const int total_in_args = method->size_of_parameters();
1789   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
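       // (+1 for the hidden JNIEnv* argument, plus one more for the class mirror if the method is static)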
1790 
1791   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1792   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1793   BasicType* in_elem_bt = nullptr;
1794 
1795   int argc = 0;
1796   out_sig_bt[argc++] = T_ADDRESS;
1797   if (method->is_static()) {
1798     out_sig_bt[argc++] = T_OBJECT;
1799   }
1800 
1801   for (int i = 0; i < total_in_args ; i++ ) {
1802     out_sig_bt[argc++] = in_sig_bt[i];
1803   }
1804 
1805   // Now figure out where the args must be stored and how much stack space
1806   // they require.
1807   int out_arg_slots;
1808   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1809 
1810   // Compute framesize for the wrapper.  We need to handlize all oops in
1811   // incoming registers
1812 
1813   // Calculate the total number of stack slots we will need.
1814 
1815   // First count the abi requirement plus all of the outgoing args
1816   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1817 
1818   // Now the space for the inbound oop handle area
1819   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
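       // (i.e. room to handlize an oop arriving in any of the six Java argument registers)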
1820 
1821   int oop_handle_offset = stack_slots;
1822   stack_slots += total_save_slots;
1823 
1824   // Now any space we need for handlizing a klass if static method
1825 
1826   int klass_slot_offset = 0;
1827   int klass_offset = -1;
1828   int lock_slot_offset = 0;
1829   bool is_static = false;
1830 
1831   if (method->is_static()) {
1832     klass_slot_offset = stack_slots;
1833     stack_slots += VMRegImpl::slots_per_word;
1834     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1835     is_static = true;
1836   }
1837 
1838   // Plus a lock if needed
1839 
1840   if (method->is_synchronized()) {
1841     lock_slot_offset = stack_slots;
1842     stack_slots += VMRegImpl::slots_per_word;
1843   }
1844 
1845   // Now a place (+2) to save return values or temp during shuffling
1846   // + 4 for return address (which we own) and saved rbp
1847   stack_slots += 6;
1848 
1849   // OK. The space we have allocated will look like:
1850   //
1851   //
1852   // FP-> |                     |
1853   //      |---------------------|
1854   //      | 2 slots for moves   |
1855   //      |---------------------|
1856   //      | lock box (if sync)  |
1857   //      |---------------------| <- lock_slot_offset
1858   //      | klass (if static)   |
1859   //      |---------------------| <- klass_slot_offset
1860   //      | oopHandle area      |
1861   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1862   //      | outbound memory     |
1863   //      | based arguments     |
1864   //      |                     |
1865   //      |---------------------|
1866   //      |                     |
1867   // SP-> | out_preserved_slots |
1868   //
1869   //
1870 
1871 
1872   // Now compute the actual number of stack words we need, rounding to keep
1873   // the stack properly aligned.
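       // (the x86-64 ABI requires rsp to be 16-byte aligned at calls)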
1874   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1875 
1876   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1877 
1878   // First thing make an ic check to see if we should even be here
1879 
1880   // We are free to use all registers as temps without saving them and
1881   // restoring them except rbp. rbp is the only callee save register
1882   // as far as the interpreter and the compiler(s) are concerned.
1883 
1884   const Register receiver = j_rarg0;
1885 
1886   Label exception_pending;
1887 
1888   assert_different_registers(receiver, rscratch1, rscratch2);
1889   __ verify_oop(receiver);
1890   __ ic_check(8 /* end_alignment */);
1891 
1892   int vep_offset = ((intptr_t)__ pc()) - start;
1893 
1894   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1895     Label L_skip_barrier;
1896     Register klass = r10;
1897     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1898     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1899 
1900     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1901 
1902     __ bind(L_skip_barrier);
1903   }
1904 
1905 #ifdef COMPILER1
1906   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1907   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1908     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1909   }
1910 #endif // COMPILER1
1911 
1912   // The instruction at the verified entry point must be 5 bytes or longer
1913   // because it can be patched on the fly by make_non_entrant. The stack bang
1914   // instruction fits that requirement.
1915 
1916   // Generate stack overflow check
1917   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1918 
1919   // Generate a new frame for the wrapper.
1920   __ enter();
1921   // -2 because return address is already present and so is saved rbp
1922   __ subptr(rsp, stack_size - 2*wordSize);
1923 
1924   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1925   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1926   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1927 
1928   // Frame is now completed as far as size and linkage.
1929   int frame_complete = ((intptr_t)__ pc()) - start;
1930 
1931   if (UseRTMLocking) {
1932     // Abort RTM transaction before calling JNI
1933     // because the critical section will be large and will be
1934     // aborted anyway. Also the nmethod could be deoptimized.
1935     __ xabort(0);
1936   }
1937 
1938 #ifdef ASSERT
1939   __ check_stack_alignment(rsp, "improperly aligned stack");
1940 #endif /* ASSERT */
1941 
1942 
1943   // We use r14 as the oop handle for the receiver/klass
1944   // It is callee save so it survives the call to native
1945 
1946   const Register oop_handle_reg = r14;
1947 
1948   //
1949   // We immediately shuffle the arguments so that for any VM call we have to
1950   // make from here on out (sync slow path, jvmti, etc.) we will have
1951   // captured the oops from our caller and have a valid oopMap for
1952   // them.
1953 
1954   // -----------------
1955   // The Grand Shuffle
1956 
1957   // The Java calling convention is either the same as (Linux) or denser than (Win64)
1958   // the C calling convention. However, because of the jni_env argument the C calling
1959   // convention always has at least one more argument (and two for static methods) than Java.
1960   // Therefore, if we move the args from Java -> C backwards then we will never have
1961   // a register->register conflict and we don't have to build a dependency graph
1962   // and figure out how to break any cycles.
1963   //
1964 
1965   // Record esp-based slot for receiver on stack for non-static methods
1966   int receiver_offset = -1;
1967 
1968   // This is a trick. We double the stack slots so we can claim
1969   // the oops in the caller's frame. Since we are sure to have
1970   // more args than the caller, doubling is enough to make
1971   // sure we can capture all the incoming oop args from the
1972   // caller.
1973   //
1974   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1975 
1976   // Mark location of rbp (someday)
1977   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1978 
1979   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1980   // All inbound args are referenced based on rbp and all outbound args via rsp.
1981 
1982 
1983 #ifdef ASSERT
1984   bool reg_destroyed[Register::number_of_registers];
1985   bool freg_destroyed[XMMRegister::number_of_registers];
1986   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
1987     reg_destroyed[r] = false;
1988   }
1989   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
1990     freg_destroyed[f] = false;
1991   }
1992 
1993 #endif /* ASSERT */
1994 
1995   // For JNI natives the incoming and outgoing registers are offset upwards.
1996   GrowableArray<int> arg_order(2 * total_in_args);
1997 
1998   VMRegPair tmp_vmreg;
1999   tmp_vmreg.set2(rbx->as_VMReg());
2000 
2001   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2002     arg_order.push(i);
2003     arg_order.push(c_arg);
2004   }
2005 
2006   int temploc = -1;
2007   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2008     int i = arg_order.at(ai);
2009     int c_arg = arg_order.at(ai + 1);
2010     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2011 #ifdef ASSERT
2012     if (in_regs[i].first()->is_Register()) {
2013       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2014     } else if (in_regs[i].first()->is_XMMRegister()) {
2015       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2016     }
2017     if (out_regs[c_arg].first()->is_Register()) {
2018       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2019     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2020       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2021     }
2022 #endif /* ASSERT */
2023     switch (in_sig_bt[i]) {
2024       case T_ARRAY:
2025       case T_OBJECT:
2026         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2027                     ((i == 0) && (!is_static)),
2028                     &receiver_offset);
2029         break;
2030       case T_VOID:
2031         break;
2032 
2033       case T_FLOAT:
2034         __ float_move(in_regs[i], out_regs[c_arg]);
2035         break;
2036 
2037       case T_DOUBLE:
2038         assert( i + 1 < total_in_args &&
2039                 in_sig_bt[i + 1] == T_VOID &&
2040                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2041         __ double_move(in_regs[i], out_regs[c_arg]);
2042         break;
2043 
2044       case T_LONG :
2045         __ long_move(in_regs[i], out_regs[c_arg]);
2046         break;
2047 
2048       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2049 
2050       default:
2051         __ move32_64(in_regs[i], out_regs[c_arg]);
2052     }
2053   }
2054 
2055   int c_arg;
2056 
2057   // Pre-load a static method's oop into r14.  Used both by locking code and
2058   // the normal JNI call code.
2059   // point c_arg at the first arg that is already loaded in case we
2060   // need to spill before we call out
2061   c_arg = total_c_args - total_in_args;
2062 
2063   if (method->is_static()) {
2064 
2065     //  load oop into a register
2066     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2067 
2068     // Now handlize the static class mirror; it's known to be not-null.
2069     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2070     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2071 
2072     // Now get the handle
2073     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2074     // store the klass handle as second argument
2075     __ movptr(c_rarg1, oop_handle_reg);
2076     // and protect the arg if we must spill
2077     c_arg--;
2078   }
2079 
2080   // Change state to native (we save the return address in the thread, since it might not
2081   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2082   // points into the right code segment. It does not have to be the correct return pc.
2083   // We use the same pc/oopMap repeatedly when we call out
2084 
2085   intptr_t the_pc = (intptr_t) __ pc();
2086   oop_maps->add_gc_map(the_pc - start, map);
2087 
2088   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2089 
2090 
2091   // We have all of the arguments set up at this point. We must not touch any of the
2092   // argument registers at this point (what if we save/restore them and there are no oops?).
2093 
2094   {
2095     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2096     // protect the args we've loaded
2097     save_args(masm, total_c_args, c_arg, out_regs);
2098     __ mov_metadata(c_rarg1, method());
2099     __ call_VM_leaf(
2100       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2101       r15_thread, c_rarg1);
2102     restore_args(masm, total_c_args, c_arg, out_regs);
2103   }
2104 
2105   // RedefineClasses() tracing support for obsolete method entry
2106   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2107     // protect the args we've loaded
2108     save_args(masm, total_c_args, c_arg, out_regs);
2109     __ mov_metadata(c_rarg1, method());
2110     __ call_VM_leaf(
2111       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2112       r15_thread, c_rarg1);
2113     restore_args(masm, total_c_args, c_arg, out_regs);
2114   }
2115 
2116   // Lock a synchronized method
2117 
2118   // Register definitions used by locking and unlocking
2119 
2120   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2121   const Register obj_reg  = rbx;  // Will contain the oop
2122   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2123   const Register old_hdr  = r13;  // value of old header at unlock time
2124 
2125   Label slow_path_lock;
2126   Label lock_done;
2127 
2128   if (method->is_synchronized()) {
2129     Label count_mon;
2130 
2131     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2132 
2133     // Get the handle (the 2nd argument)
2134     __ mov(oop_handle_reg, c_rarg1);
2135 
2136     // Get address of the box
2137 
2138     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2139 
2140     // Load the oop from the handle
2141     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2142 
2143     if (LockingMode == LM_MONITOR) {
2144       __ jmp(slow_path_lock);
2145     } else if (LockingMode == LM_LEGACY) {
2146       // Load immediate 1 into swap_reg %rax
2147       __ movl(swap_reg, 1);
2148 
2149       // Load (object->mark() | 1) into swap_reg %rax
2150       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2151 
2152       // Save (object->mark() | 1) into BasicLock's displaced header
2153       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2154 
2155       // src -> dest iff dest == rax else rax <- dest
2156       __ lock();
2157       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2158       __ jcc(Assembler::equal, count_mon);
2159 
2160       // Hmm should this move to the slow path code area???
2161 
2162       // Test if the oopMark is an obvious stack pointer, i.e.,
2163       //  1) (mark & 3) == 0, and
2164       //  2) rsp <= mark < mark + os::pagesize()
2165       // These 3 tests can be done by evaluating the following
2166       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2167       // assuming both stack pointer and pagesize have their
2168       // least significant 2 bits clear.
2169       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2170 
2171       __ subptr(swap_reg, rsp);
2172       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2173 
2174       // Save the test result, for recursive case, the result is zero
2175       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2176       __ jcc(Assembler::notEqual, slow_path_lock);
2177     } else {
2178       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2179       __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2180     }
2181     __ jmp (lock_done);
2182 
2183     __ bind(count_mon);
2184     __ inc_held_monitor_count();
2185 
2186     // Slow path will re-enter here
2187     __ bind(lock_done);
2188   }
2189 
2190   // Finally just about ready to make the JNI call
2191 
2192   // get JNIEnv* which is first argument to native
2193   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2194 
2195   // Now set thread in native
2196   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2197 
2198   __ call(RuntimeAddress(native_func));
2199 
2200   // Verify or restore cpu control state after JNI call
2201   __ restore_cpu_control_state_after_jni(rscratch1);
2202 
2203   // Unpack native results.
2204   switch (ret_type) {
2205   case T_BOOLEAN: __ c2bool(rax);            break;
2206   case T_CHAR   : __ movzwl(rax, rax);      break;
2207   case T_BYTE   : __ sign_extend_byte (rax); break;
2208   case T_SHORT  : __ sign_extend_short(rax); break;
2209   case T_INT    : /* nothing to do */        break;
2210   case T_DOUBLE :
2211   case T_FLOAT  :
2212     // Result is in xmm0 we'll save as needed
2213     break;
2214   case T_ARRAY:                 // Really a handle
2215   case T_OBJECT:                // Really a handle
2216       break; // can't de-handlize until after safepoint check
2217   case T_VOID: break;
2218   case T_LONG: break;
2219   default       : ShouldNotReachHere();
2220   }
2221 
2222   Label after_transition;
2223 
2224   // Switch thread to "native transition" state before reading the synchronization state.
2225   // This additional state is necessary because reading and testing the synchronization
2226   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2227   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2228   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2229   //     Thread A is resumed to finish this native method, but doesn't block here since it
2230   //     didn't see any synchronization in progress, and escapes.
2231   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2232 
2233   // Force this write out before the read below
2234   if (!UseSystemMemoryBarrier) {
2235     __ membar(Assembler::Membar_mask_bits(
2236               Assembler::LoadLoad | Assembler::LoadStore |
2237               Assembler::StoreLoad | Assembler::StoreStore));
2238   }
2239 
2240   // check for safepoint operation in progress and/or pending suspend requests
2241   {
2242     Label Continue;
2243     Label slow_path;
2244 
2245     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2246 
2247     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2248     __ jcc(Assembler::equal, Continue);
2249     __ bind(slow_path);
2250 
2251     // Don't use call_VM as it will see a possible pending exception and forward it
2252     // and never return here preventing us from clearing _last_native_pc down below.
2253     // We also can't use call_VM_leaf, as it will check to see if rsi & rdi are
2254     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2255     // by hand.
2256     //
2257     __ vzeroupper();
2258     save_native_result(masm, ret_type, stack_slots);
2259     __ mov(c_rarg0, r15_thread);
2260     __ mov(r12, rsp); // remember sp
2261     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2262     __ andptr(rsp, -16); // align stack as required by ABI
2263     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2264     __ mov(rsp, r12); // restore sp
2265     __ reinit_heapbase();
2266     // Restore any method result value
2267     restore_native_result(masm, ret_type, stack_slots);
2268     __ bind(Continue);
2269   }
2270 
2271   // change thread state
2272   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2273   __ bind(after_transition);
2274 
2275   Label reguard;
2276   Label reguard_done;
2277   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2278   __ jcc(Assembler::equal, reguard);
2279   __ bind(reguard_done);
2280 
2281   // native result if any is live
2282 
2283   // Unlock
2284   Label slow_path_unlock;
2285   Label unlock_done;
2286   if (method->is_synchronized()) {
2287 
2288     Label fast_done;
2289 
2290     // Get locked oop from the handle we passed to jni
2291     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2292 
2293     if (LockingMode == LM_LEGACY) {
2294       Label not_recur;
2295       // Simple recursive lock?
2296       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2297       __ jcc(Assembler::notEqual, not_recur);
2298       __ jmpb(fast_done);
2299       __ bind(not_recur);
2300     }
2301 
2302     // Must save rax if it is live now because cmpxchg must use it
2303     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2304       save_native_result(masm, ret_type, stack_slots);
2305     }
2306 
2307     if (LockingMode == LM_MONITOR) {
2308       __ jmp(slow_path_unlock);
2309     } else if (LockingMode == LM_LEGACY) {
2310       // get address of the stack lock
2311       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2312       //  get old displaced header
2313       __ movptr(old_hdr, Address(rax, 0));
2314 
2315       // Atomic swap old header if oop still contains the stack lock
2316       __ lock();
2317       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2318       __ jcc(Assembler::notEqual, slow_path_unlock);
2319       __ dec_held_monitor_count();
2320     } else {
2321       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2322       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2323     }
2324 
2325     // slow path re-enters here
2326     __ bind(unlock_done);
2327     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2328       restore_native_result(masm, ret_type, stack_slots);
2329     }
2330 
2331     __ bind(fast_done);
2332   }
2333   {
2334     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2335     save_native_result(masm, ret_type, stack_slots);
2336     __ mov_metadata(c_rarg1, method());
2337     __ call_VM_leaf(
2338          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2339          r15_thread, c_rarg1);
2340     restore_native_result(masm, ret_type, stack_slots);
2341   }
2342 
2343   __ reset_last_Java_frame(false);
2344 
2345   // Unbox oop result, e.g. JNIHandles::resolve value.
2346   if (is_reference_type(ret_type)) {
2347     __ resolve_jobject(rax /* value */,
2348                        r15_thread /* thread */,
2349                        rcx /* tmp */);
2350   }
2351 
2352   if (CheckJNICalls) {
2353     // clear_pending_jni_exception_check
2354     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2355   }
2356 
2357   // reset handle block
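       // (zeroing the top pointer releases all local JNI handles created during the native call)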
2358   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2359   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2360 
2361   // pop our frame
2362 
2363   __ leave();
2364 
2365   // Any exception pending?
2366   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2367   __ jcc(Assembler::notEqual, exception_pending);
2368 
2369   // Return
2370 
2371   __ ret(0);
2372 
2373   // Unexpected paths are out of line and go here
2374 
2375   // forward the exception
2376   __ bind(exception_pending);
2377 
2378   // and forward the exception
2379   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2380 
2381   // Slow path locking & unlocking
2382   if (method->is_synchronized()) {
2383 
2384     // BEGIN Slow path lock
2385     __ bind(slow_path_lock);
2386 
2387     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2388     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2389 
2390     // protect the args we've loaded
2391     save_args(masm, total_c_args, c_arg, out_regs);
2392 
2393     __ mov(c_rarg0, obj_reg);
2394     __ mov(c_rarg1, lock_reg);
2395     __ mov(c_rarg2, r15_thread);
2396 
2397     // Not a leaf but we have last_Java_frame setup as we want
2398     // Force freeze slow path on ObjectMonitor::enter() for now which will fail with freeze_pinned_native.
2399     __ push_cont_fastpath();
2400     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2401     __ pop_cont_fastpath();
2402     restore_args(masm, total_c_args, c_arg, out_regs);
2403 
2404 #ifdef ASSERT
2405     { Label L;
2406     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2407     __ jcc(Assembler::equal, L);
2408     __ stop("no pending exception allowed on exit from monitorenter");
2409     __ bind(L);
2410     }
2411 #endif
2412     __ jmp(lock_done);
2413 
2414     // END Slow path lock
2415 
2416     // BEGIN Slow path unlock
2417     __ bind(slow_path_unlock);
2418 
2419     // If we haven't already saved the native result we must save it now as xmm registers
2420     // are still exposed.
2421     __ vzeroupper();
2422     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2423       save_native_result(masm, ret_type, stack_slots);
2424     }
2425 
2426     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2427 
2428     __ mov(c_rarg0, obj_reg);
2429     __ mov(c_rarg2, r15_thread);
2430     __ mov(r12, rsp); // remember sp
2431     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2432     __ andptr(rsp, -16); // align stack as required by ABI
2433 
2434     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2435     // NOTE that obj_reg == rbx currently
2436     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2437     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2438 
2439     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2440     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2441     __ mov(rsp, r12); // restore sp
2442     __ reinit_heapbase();
2443 #ifdef ASSERT
2444     {
2445       Label L;
2446       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2447       __ jcc(Assembler::equal, L);
2448       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2449       __ bind(L);
2450     }
2451 #endif /* ASSERT */
2452 
2453     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2454 
2455     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2456       restore_native_result(masm, ret_type, stack_slots);
2457     }
2458     __ jmp(unlock_done);
2459 
2460     // END Slow path unlock
2461 
2462   } // synchronized
2463 
2464   // SLOW PATH Reguard the stack if needed
2465 
2466   __ bind(reguard);
2467   __ vzeroupper();
2468   save_native_result(masm, ret_type, stack_slots);
2469   __ mov(r12, rsp); // remember sp
2470   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2471   __ andptr(rsp, -16); // align stack as required by ABI
2472   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2473   __ mov(rsp, r12); // restore sp
2474   __ reinit_heapbase();
2475   restore_native_result(masm, ret_type, stack_slots);
2476   // and continue
2477   __ jmp(reguard_done);
2478 
2479 
2480 
2481   __ flush();
2482 
2483   nmethod *nm = nmethod::new_native_nmethod(method,
2484                                             compile_id,
2485                                             masm->code(),
2486                                             vep_offset,
2487                                             frame_complete,
2488                                             stack_slots / VMRegImpl::slots_per_word,
2489                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2490                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2491                                             oop_maps);
2492 
2493   return nm;
2494 }
2495 
2496 // This function returns the adjustment size (in number of words) to a c2i adapter
2497 // activation for use during deoptimization.
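     // (e.g. a callee with 5 locals and 2 parameters needs (5 - 2) * Interpreter::stackElementWords
     // extra words)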
2498 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2499   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2500 }
2501 
2502 
2503 uint SharedRuntime::out_preserve_stack_slots() {
2504   return 0;
2505 }
2506 
2507 
2508 // Number of stack slots between incoming argument block and the start of
2509 // a new frame.  The PROLOG must add this many slots to the stack.  The
2510 // EPILOG must remove this many slots.  amd64 needs two slots for
2511 // the return address.
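     // (The value 4 presumably covers the return address and the saved rbp, two stack slots each;
     // VerifyStackAtCalls reserves two additional slots.)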
2512 uint SharedRuntime::in_preserve_stack_slots() {
2513   return 4 + 2 * VerifyStackAtCalls;
2514 }
2515 
2516 VMReg SharedRuntime::thread_register() {
2517   return r15_thread->as_VMReg();
2518 }
2519 
2520 //------------------------------generate_deopt_blob----------------------------
2521 void SharedRuntime::generate_deopt_blob() {
2522   // Allocate space for the code
2523   ResourceMark rm;
2524   // Setup code generation tools
2525   int pad = 0;
2526   if (UseAVX > 2) {
2527     pad += 1024;
2528   }
2529 #if INCLUDE_JVMCI
2530   if (EnableJVMCI) {
2531     pad += 512; // Increase the buffer size when compiling for JVMCI
2532   }
2533 #endif
2534   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2535   MacroAssembler* masm = new MacroAssembler(&buffer);
2536   int frame_size_in_words;
2537   OopMap* map = nullptr;
2538   OopMapSet *oop_maps = new OopMapSet();
2539 
2540   // -------------
2541   // This code enters when returning to a de-optimized nmethod.  A return
2542   // address has been pushed on the stack, and return values are in
2543   // registers.
2544   // If we are doing a normal deopt then we were called from the patched
2545   // nmethod from the point we returned to the nmethod. So the return
2546   // address on the stack is wrong by NativeCall::instruction_size.
2547   // We will adjust the value so it looks like we have the original return
2548   // address on the stack (like when we eagerly deoptimized).
2549   // In the case of an exception pending when deoptimizing, we enter
2550   // with a return address on the stack that points after the call we patched
2551   // into the exception handler. We have the following register state from,
2552   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2553   //    rax: exception oop
2554   //    rbx: exception handler
2555   //    rdx: throwing pc
2556   // So in this case we simply jam rdx into the useless return address and
2557   // the stack looks just like we want.
2558   //
2559   // At this point we need to de-opt.  We save the argument return
2560   // registers.  We call the first C routine, fetch_unroll_info().  This
2561   // routine captures the return values and returns a structure which
2562   // describes the current frame size and the sizes of all replacement frames.
2563   // The current frame is compiled code and may contain many inlined
2564   // functions, each with their own JVM state.  We pop the current frame, then
2565   // push all the new frames.  Then we call the C routine unpack_frames() to
2566   // populate these frames.  Finally unpack_frames() returns us the new target
2567   // address.  Notice that callee-save registers are BLOWN here; they have
2568   // already been captured in the vframeArray at the time the return PC was
2569   // patched.
2570   address start = __ pc();
2571   Label cont;
2572 
2573   // Prolog for non exception case!
2574 
2575   // Save everything in sight.
2576   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2577 
2578   // Normal deoptimization.  Save exec mode for unpack_frames.
2579   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2580   __ jmp(cont);
2581 
2582   int reexecute_offset = __ pc() - start;
2583 #if INCLUDE_JVMCI && !defined(COMPILER1)
2584   if (EnableJVMCI && UseJVMCICompiler) {
2585     // JVMCI does not use this kind of deoptimization
2586     __ should_not_reach_here();
2587   }
2588 #endif
2589 
2590   // Reexecute case
2591   // The return address is the pc that describes what bci to re-execute at.
2592 
2593   // No need to update map as each call to save_live_registers will produce identical oopmap
2594   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2595 
2596   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2597   __ jmp(cont);
2598 
2599 #if INCLUDE_JVMCI
2600   Label after_fetch_unroll_info_call;
2601   int implicit_exception_uncommon_trap_offset = 0;
2602   int uncommon_trap_offset = 0;
2603 
2604   if (EnableJVMCI) {
2605     implicit_exception_uncommon_trap_offset = __ pc() - start;
2606 
2607     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2608     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2609 
2610     uncommon_trap_offset = __ pc() - start;
2611 
2612     // Save everything in sight.
2613     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2614     // fetch_unroll_info needs to call last_java_frame()
2615     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2616 
2617     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2618     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2619 
2620     __ movl(r14, Deoptimization::Unpack_reexecute);
2621     __ mov(c_rarg0, r15_thread);
2622     __ movl(c_rarg2, r14); // exec mode
2623     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2624     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2625 
2626     __ reset_last_Java_frame(false);
2627 
2628     __ jmp(after_fetch_unroll_info_call);
2629   } // EnableJVMCI
2630 #endif // INCLUDE_JVMCI
2631 
2632   int exception_offset = __ pc() - start;
2633 
2634   // Prolog for exception case
2635 
2636   // all registers are dead at this entry point, except for rax, and
2637   // rdx which contain the exception oop and exception pc
2638   // respectively.  Set them in TLS and fall thru to the
2639   // unpack_with_exception_in_tls entry point.
2640 
2641   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2642   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2643 
2644   int exception_in_tls_offset = __ pc() - start;
2645 
2646   // new implementation because exception oop is now passed in JavaThread
2647 
2648   // Prolog for exception case
2649   // All registers must be preserved because they might be used by LinearScan
2650   // Exception oop and throwing PC are passed in JavaThread
2651   // tos: stack at point of call to method that threw the exception (i.e. only
2652   // args are on the stack, no return address)
2653 
2654   // make room on stack for the return address
2655   // It will be patched later with the throwing pc. The correct value is not
2656   // available now because loading it from memory would destroy registers.
2657   __ push(0);
2658 
2659   // Save everything in sight.
2660   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2661 
2662   // Now it is safe to overwrite any register
2663 
2664   // Deopt during an exception.  Save exec mode for unpack_frames.
2665   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2666 
2667   // load throwing pc from JavaThread and patch it as the return address
2668   // of the current frame. Then clear the field in JavaThread
2669 
2670   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2671   __ movptr(Address(rbp, wordSize), rdx);
2672   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2673 
2674 #ifdef ASSERT
2675   // verify that there is really an exception oop in JavaThread
2676   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2677   __ verify_oop(rax);
2678 
2679   // verify that there is no pending exception
2680   Label no_pending_exception;
2681   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2682   __ testptr(rax, rax);
2683   __ jcc(Assembler::zero, no_pending_exception);
2684   __ stop("must not have pending exception here");
2685   __ bind(no_pending_exception);
2686 #endif
2687 
2688   __ bind(cont);
2689 
2690   // Call C code.  Need thread and this frame, but NOT official VM entry
2691   // crud.  We cannot block on this call, no GC can happen.
2692   //
2693   // UnrollBlock* fetch_unroll_info(JavaThread* thread, int exec_mode)
2694 
2695   // fetch_unroll_info needs to call last_java_frame().
2696 
2697   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2698 #ifdef ASSERT
2699   { Label L;
2700     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2701     __ jcc(Assembler::equal, L);
2702     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2703     __ bind(L);
2704   }
2705 #endif // ASSERT
2706   __ mov(c_rarg0, r15_thread);
2707   __ movl(c_rarg1, r14); // exec_mode
2708   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2709 
2710   // Need to have an oopmap that tells fetch_unroll_info where to
2711   // find any register it might need.
2712   oop_maps->add_gc_map(__ pc() - start, map);
2713 
2714   __ reset_last_Java_frame(false);
2715 
2716 #if INCLUDE_JVMCI
2717   if (EnableJVMCI) {
2718     __ bind(after_fetch_unroll_info_call);
2719   }
2720 #endif
2721 
2722   // Load UnrollBlock* into rdi
2723   __ mov(rdi, rax);
2724 
2725   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2726   Label noException;
2727   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2728   __ jcc(Assembler::notEqual, noException);
2729   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2730   // QQQ this load is useless; exception_pc was cleared to null above
2731   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2732   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2733   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2734 
2735   __ verify_oop(rax);
2736 
2737   // Overwrite the result registers with the exception results.
2738   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2739   // I think this is useless
2740   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2741 
2742   __ bind(noException);
2743 
2744   // Only register save data is on the stack.
2745   // Now restore the result registers.  Everything else is either dead
2746   // or captured in the vframeArray.
2747   RegisterSaver::restore_result_registers(masm);
2748 
2749   // All of the register save area has been popped off the stack. Only the
2750   // return address remains.
2751 
2752   // Pop all the frames we must move/replace.
2753   //
2754   // Frame picture (youngest to oldest)
2755   // 1: self-frame (no frame link)
2756   // 2: deopting frame  (no frame link)
2757   // 3: caller of deopting frame (could be compiled/interpreted).
2758   //
2759   // Note: by leaving the return address of self-frame on the stack
2760   // and using the size of frame 2 to adjust the stack
2761   // when we are done the return to frame 3 will still be on the stack.
2762 
2763   // Pop deoptimized frame
2764   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2765   __ addptr(rsp, rcx);
2766 
2767   // rsp should be pointing at the return address to the caller (3)
2768 
2769   // Pick up the initial fp we should save
2770   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2771   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2772 
2773 #ifdef ASSERT
2774   // Compilers generate code that bangs the stack by as much as the
2775   // interpreter would need. So this stack banging should never
2776   // trigger a fault. Verify that it does not on non-product builds.
2777   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2778   __ bang_stack_size(rbx, rcx);
2779 #endif
2780 
2781   // Load address of array of frame pcs into rcx
2782   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2783 
2784   // Trash the old pc
2785   __ addptr(rsp, wordSize);
2786 
2787   // Load address of array of frame sizes into rsi
2788   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2789 
2790   // Load counter into rdx
2791   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2792 
2793   // Now adjust the caller's stack to make up for the extra locals
2794   // but record the original sp so that we can save it in the skeletal interpreter
2795   // frame and the stack walking of interpreter_sender will get the unextended sp
2796   // value and not the "real" sp value.
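  //
  // The loop below builds one skeletal interpreter frame per unrolled frame
  // (layout_activation_impl fills in the remaining fields later). Rough sketch
  // of what the loop pushes for each frame, offsets symbolic:
  //
  //   [ return pc (from frame_pcs[]) ]   <- pushptr(Address(rcx, 0))
  //   [ saved rbp                    ]   <- enter()
  //   [ frame_sizes[] - 2 words      ]   <- subptr(rsp, rbx)
  //       interpreter_frame_sender_sp slot = sender_sp
  //       interpreter_frame_last_sp   slot = NULL_WORD (corrected later)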
2797 
2798   const Register sender_sp = r8;
2799 
2800   __ mov(sender_sp, rsp);
2801   __ movl(rbx, Address(rdi,
2802                        Deoptimization::UnrollBlock::
2803                        caller_adjustment_offset()));
2804   __ subptr(rsp, rbx);
2805 
2806   // Push interpreter frames in a loop
2807   Label loop;
2808   __ bind(loop);
2809   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2810   __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
2811   __ pushptr(Address(rcx, 0));          // Save return address
2812   __ enter();                           // Save old & set new rbp
2813   __ subptr(rsp, rbx);                  // Prolog
2814   // This value is corrected by layout_activation_impl
2815   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2816   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2817   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2818   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2819   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2820   __ decrementl(rdx);                   // Decrement counter
2821   __ jcc(Assembler::notZero, loop);
2822   __ pushptr(Address(rcx, 0));          // Save final return address
2823 
2824   // Re-push self-frame
2825   __ enter();                           // Save old & set new rbp
2826 
2827   // Allocate a full sized register save area.
2828   // Return address and rbp are in place, so we allocate two fewer words.
2829   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2830 
2831   // Restore frame locals after moving the frame
2832   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2833   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2834 
2835   // Call C code.  Need thread but NOT official VM entry
2836   // crud.  We cannot block on this call, no GC can happen.  Call should
2837   // restore return values to their stack-slots with the new SP.
2838   //
2839   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2840 
2841   // Use rbp because the frames look interpreted now
2842   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2843   // Don't need the precise return PC here, just precise enough to point into this code blob.
2844   address the_pc = __ pc();
2845   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2846 
2847   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2848   __ mov(c_rarg0, r15_thread);
2849   __ movl(c_rarg1, r14); // second arg: exec_mode
2850   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2851   // Revert SP alignment after call since we're going to do some SP relative addressing below
2852   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2853 
2854   // Set an oopmap for the call site
2855   // Use the same PC we used for the last java frame
2856   oop_maps->add_gc_map(the_pc - start,
2857                        new OopMap( frame_size_in_words, 0 ));
2858 
2859   // Clear fp AND pc
2860   __ reset_last_Java_frame(true);
2861 
2862   // Collect return values
2863   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2864   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2865   // I think this is useless (throwing pc?)
2866   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2867 
2868   // Pop self-frame.
2869   __ leave();                           // Epilog
2870 
2871   // Jump to interpreter
2872   __ ret(0);
2873 
2874   // Make sure all code is generated
2875   masm->flush();
2876 
2877   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2878   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2879 #if INCLUDE_JVMCI
2880   if (EnableJVMCI) {
2881     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2882     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2883   }
2884 #endif
2885 }
2886 
2887 #ifdef COMPILER2
2888 //------------------------------generate_uncommon_trap_blob--------------------
2889 void SharedRuntime::generate_uncommon_trap_blob() {
2890   // Allocate space for the code
2891   ResourceMark rm;
2892   // Setup code generation tools
2893   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2894   MacroAssembler* masm = new MacroAssembler(&buffer);
2895 
2896   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2897 
2898   address start = __ pc();
2899 
2900   if (UseRTMLocking) {
2901     // Abort RTM transaction before possible nmethod deoptimization.
2902     __ xabort(0);
2903   }
2904 
2905   // Push self-frame.  We get here with a return address on the
2906   // stack, so rsp is 8-byte aligned until we allocate our frame.
2907   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2908 
2909   // No callee saved registers. rbp is assumed implicitly saved
2910   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2911 
2912   // The compiler left unloaded_class_index in j_rarg0; move it to where
2913   // the runtime expects it.
2914   __ movl(c_rarg1, j_rarg0);
2915 
2916   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2917 
2918   // Call C code.  Need thread but NOT official VM entry
2919   // crud.  We cannot block on this call, no GC can happen.  Call should
2920   // capture callee-saved registers as well as return values.
2921   // Thread is in rdi already.
2922   //
2923   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
2924 
2925   __ mov(c_rarg0, r15_thread);
2926   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2927   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2928 
2929   // Set an oopmap for the call site
2930   OopMapSet* oop_maps = new OopMapSet();
2931   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2932 
2933   // location of rbp is known implicitly by the frame sender code
2934 
2935   oop_maps->add_gc_map(__ pc() - start, map);
2936 
2937   __ reset_last_Java_frame(false);
2938 
2939   // Load UnrollBlock* into rdi
2940   __ mov(rdi, rax);
2941 
2942 #ifdef ASSERT
2943   { Label L;
2944     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
2945               Deoptimization::Unpack_uncommon_trap);
2946     __ jcc(Assembler::equal, L);
2947     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2948     __ bind(L);
2949   }
2950 #endif
2951 
2952   // Pop all the frames we must move/replace.
2953   //
2954   // Frame picture (youngest to oldest)
2955   // 1: self-frame (no frame link)
2956   // 2: deopting frame  (no frame link)
2957   // 3: caller of deopting frame (could be compiled/interpreted).
2958 
2959   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2960   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2961 
2962   // Pop deoptimized frame (int)
2963   __ movl(rcx, Address(rdi,
2964                        Deoptimization::UnrollBlock::
2965                        size_of_deoptimized_frame_offset()));
2966   __ addptr(rsp, rcx);
2967 
2968   // rsp should be pointing at the return address to the caller (3)
2969 
2970   // Pick up the initial fp we should save
2971   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2972   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2973 
2974 #ifdef ASSERT
2975   // Compilers generate code that bangs the stack by as much as the
2976   // interpreter would need. So this stack banging should never
2977   // trigger a fault. Verify that it does not on non-product builds.
2978   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2979   __ bang_stack_size(rbx, rcx);
2980 #endif
2981 
2982   // Load address of array of frame pcs into rcx (address*)
2983   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2984 
2985   // Trash the return pc
2986   __ addptr(rsp, wordSize);
2987 
2988   // Load address of array of frame sizes into rsi (intptr_t*)
2989   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2990 
2991   // Counter
2992   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
2993 
2994   // Now adjust the caller's stack to make up for the extra locals but
2995   // record the original sp so that we can save it in the skeletal
2996   // interpreter frame and the stack walking of interpreter_sender
2997   // will get the unextended sp value and not the "real" sp value.
2998 
2999   const Register sender_sp = r8;
3000 
3001   __ mov(sender_sp, rsp);
3002   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
3003   __ subptr(rsp, rbx);
3004 
3005   // Push interpreter frames in a loop
3006   Label loop;
3007   __ bind(loop);
3008   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3009   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3010   __ pushptr(Address(rcx, 0));     // Save return address
3011   __ enter();                      // Save old & set new rbp
3012   __ subptr(rsp, rbx);             // Prolog
3013   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3014             sender_sp);            // Make it walkable
3015   // This value is corrected by layout_activation_impl
3016   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3017   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3018   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3019   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3020   __ decrementl(rdx);              // Decrement counter
3021   __ jcc(Assembler::notZero, loop);
3022   __ pushptr(Address(rcx, 0));     // Save final return address
3023 
3024   // Re-push self-frame
3025   __ enter();                 // Save old & set new rbp
3026   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3027                               // Prolog
3028 
3029   // Use rbp because the frames look interpreted now
3030   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3031   // Don't need the precise return PC here, just precise enough to point into this code blob.
3032   address the_pc = __ pc();
3033   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3034 
3035   // Call C code.  Need thread but NOT official VM entry
3036   // crud.  We cannot block on this call, no GC can happen.  Call should
3037   // restore return values to their stack-slots with the new SP.
3038   // Thread is in rdi already.
3039   //
3040   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3041 
3042   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3043   __ mov(c_rarg0, r15_thread);
3044   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3045   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3046 
3047   // Set an oopmap for the call site
3048   // Use the same PC we used for the last java frame
3049   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3050 
3051   // Clear fp AND pc
3052   __ reset_last_Java_frame(true);
3053 
3054   // Pop self-frame.
3055   __ leave();                 // Epilog
3056 
3057   // Jump to interpreter
3058   __ ret(0);
3059 
3060   // Make sure all code is generated
3061   masm->flush();
3062 
3063   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3064                                                  SimpleRuntimeFrame::framesize >> 1);
3065 }
3066 #endif // COMPILER2
3067 
3068 //------------------------------generate_handler_blob------
3069 //
3070 // Generate a special Compile2Runtime blob that saves all registers,
3071 // and sets up an oopmap.
3072 //
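// The poll_type argument selects the poll flavor handled below: POLL_AT_RETURN
// means the poll happened at a method return (the return address is already on
// the stack), while POLL_AT_VECTOR_LOOP additionally requests saving the wide
// vector registers. A typical call site in shared code looks roughly like the
// following sketch (names taken from sharedRuntime.cpp; treat as illustrative):
//
//   generate_handler_blob(CAST_FROM_FN_PTR(address,
//       SafepointSynchronize::handle_polling_page_exception), POLL_AT_LOOP);
//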
3073 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3074   assert(StubRoutines::forward_exception_entry() != nullptr,
3075          "must be generated before");
3076 
3077   ResourceMark rm;
3078   OopMapSet *oop_maps = new OopMapSet();
3079   OopMap* map;
3080 
3081   // Allocate space for the code.  Setup code generation tools.
3082   CodeBuffer buffer("handler_blob", 2048, 1024);
3083   MacroAssembler* masm = new MacroAssembler(&buffer);
3084 
3085   address start   = __ pc();
3086   address call_pc = nullptr;
3087   int frame_size_in_words;
3088   bool cause_return = (poll_type == POLL_AT_RETURN);
3089   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3090 
3091   if (UseRTMLocking) {
3092     // Abort RTM transaction before calling runtime
3093     // because critical section will be large and will be
3094     // aborted anyway. Also nmethod could be deoptimized.
3095     __ xabort(0);
3096   }
3097 
3098   // Make room for return address (or push it again)
3099   if (!cause_return) {
3100     __ push(rbx);
3101   }
3102 
3103   // Save registers, fpu state, and flags
3104   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3105 
3106   // The following is basically a call_VM.  However, we need the precise
3107   // address of the call in order to generate an oopmap. Hence, we do all the
3108   // work ourselves.
3109 
3110   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3111 
3112   // The return address must always be correct so that frame constructor never
3113   // sees an invalid pc.
3114 
3115   if (!cause_return) {
3116     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3117     // Additionally, rbx is a callee saved register and we can look at it later to determine
3118     // if someone changed the return address for us!
3119     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3120     __ movptr(Address(rbp, wordSize), rbx);
3121   }
3122 
3123   // Do the call
3124   __ mov(c_rarg0, r15_thread);
3125   __ call(RuntimeAddress(call_ptr));
3126 
3127   // Set an oopmap for the call site.  This oopmap will map all
3128   // oop-registers and debug-info registers as callee-saved.  This
3129   // will allow deoptimization at this safepoint to find all possible
3130   // debug-info recordings, as well as let GC find all oops.
3131 
3132   oop_maps->add_gc_map( __ pc() - start, map);
3133 
3134   Label noException;
3135 
3136   __ reset_last_Java_frame(false);
3137 
3138   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3139   __ jcc(Assembler::equal, noException);
3140 
3141   // Exception pending
3142 
3143   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3144 
3145   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3146 
3147   // No exception case
3148   __ bind(noException);
3149 
3150   Label no_adjust;
3151 #ifdef ASSERT
3152   Label bail;
3153 #endif
3154   if (!cause_return) {
3155     Label no_prefix, not_special;
3156 
3157     // If our stashed return pc was modified by the runtime we avoid touching it
3158     __ cmpptr(rbx, Address(rbp, wordSize));
3159     __ jccb(Assembler::notEqual, no_adjust);
3160 
3161     // Skip over the poll instruction.
3162     // See NativeInstruction::is_safepoint_poll()
3163     // Possible encodings:
3164     //      85 00       test   %eax,(%rax)
3165     //      85 01       test   %eax,(%rcx)
3166     //      85 02       test   %eax,(%rdx)
3167     //      85 03       test   %eax,(%rbx)
3168     //      85 06       test   %eax,(%rsi)
3169     //      85 07       test   %eax,(%rdi)
3170     //
3171     //   41 85 00       test   %eax,(%r8)
3172     //   41 85 01       test   %eax,(%r9)
3173     //   41 85 02       test   %eax,(%r10)
3174     //   41 85 03       test   %eax,(%r11)
3175     //   41 85 06       test   %eax,(%r14)
3176     //   41 85 07       test   %eax,(%r15)
3177     //
3178     //      85 04 24    test   %eax,(%rsp)
3179     //   41 85 04 24    test   %eax,(%r12)
3180     //      85 45 00    test   %eax,0x0(%rbp)
3181     //   41 85 45 00    test   %eax,0x0(%r13)
3182 
3183     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3184     __ jcc(Assembler::notEqual, no_prefix);
3185     __ addptr(rbx, 1);
3186     __ bind(no_prefix);
3187 #ifdef ASSERT
3188     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3189 #endif
3190     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3191     // r12/rsp 0x04
3192     // r13/rbp 0x05
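    //
    // Why those bases need an extra byte (a sketch of the x86 ModRM encoding,
    // for reference only):
    //   ModRM = mod(2 bits) | reg(3 bits) | rm(3 bits)
    //   rm == 100 (rsp/r12) forces a SIB byte to follow, e.g. 85 04 24
    //   rm == 101 (rbp/r13) is emitted as mod == 01 plus a disp8, e.g. 85 45 00
    // Either way the poll instruction is one byte longer, so rbx is bumped once
    // more here before the final +2 adjustment below.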
3193     __ movzbq(rcx, Address(rbx, 1));
3194     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3195     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3196     __ cmpptr(rcx, 1);
3197     __ jcc(Assembler::above, not_special);
3198     __ addptr(rbx, 1);
3199     __ bind(not_special);
3200 #ifdef ASSERT
3201     // Verify the correct encoding of the poll we're about to skip.
3202     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3203     __ jcc(Assembler::notEqual, bail);
3204     // Mask out the modrm bits
3205     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3206     // rax encodes to 0, so if the bits are nonzero it's incorrect
3207     __ jcc(Assembler::notZero, bail);
3208 #endif
3209     // Adjust return pc forward to step over the safepoint poll instruction
3210     __ addptr(rbx, 2);
3211     __ movptr(Address(rbp, wordSize), rbx);
3212   }
3213 
3214   __ bind(no_adjust);
3215   // Normal exit, restore registers and exit.
3216   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3217   __ ret(0);
3218 
3219 #ifdef ASSERT
3220   __ bind(bail);
3221   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3222 #endif
3223 
3224   // Make sure all code is generated
3225   masm->flush();
3226 
3227   // Fill-out other meta info
3228   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3229 }
3230 
3231 //
3232 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3233 //
3234 // Generate a stub that calls into the VM to find out the proper destination
3235 // of a Java call. All the argument registers are live at this point,
3236 // but since this is generic code we don't know what they are and the caller
3237 // must do any GC of the args.
3238 //
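// For reference, the destinations passed in are the shared resolution entry
// points set up in sharedRuntime.cpp, e.g. SharedRuntime::resolve_static_call_C,
// resolve_virtual_call_C, resolve_opt_virtual_call_C and
// handle_wrong_method_ic_miss (exact names and call sites may vary by release).
//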
3239 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3240   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3241 
3242   // allocate space for the code
3243   ResourceMark rm;
3244 
3245   CodeBuffer buffer(name, 1200, 512);
3246   MacroAssembler* masm = new MacroAssembler(&buffer);
3247 
3248   int frame_size_in_words;
3249 
3250   OopMapSet *oop_maps = new OopMapSet();
3251   OopMap* map = nullptr;
3252 
3253   int start = __ offset();
3254 
3255   // No need to save vector registers since they are caller-saved anyway.
3256   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3257 
3258   int frame_complete = __ offset();
3259 
3260   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3261 
3262   __ mov(c_rarg0, r15_thread);
3263 
3264   __ call(RuntimeAddress(destination));
3265 
3266 
3267   // Set an oopmap for the call site.
3268   // We need this not only for callee-saved registers, but also for volatile
3269   // registers that the compiler might be keeping live across a safepoint.
3270 
3271   oop_maps->add_gc_map( __ offset() - start, map);
3272 
3273   // rax contains the address we are going to jump to assuming no exception got installed
3274 
3275   // clear last_Java_sp
3276   __ reset_last_Java_frame(false);
3277   // check for pending exceptions
3278   Label pending;
3279   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3280   __ jcc(Assembler::notEqual, pending);
3281 
3282   // get the returned Method*
3283   __ get_vm_result_2(rbx, r15_thread);
3284   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3285 
3286   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3287 
3288   RegisterSaver::restore_live_registers(masm);
3289 
3290   // We are back to the original state on entry and ready to go.
3291 
3292   __ jmp(rax);
3293 
3294   // Pending exception after the safepoint
3295 
3296   __ bind(pending);
3297 
3298   RegisterSaver::restore_live_registers(masm);
3299 
3300   // exception pending => remove activation and forward to exception handler
3301 
3302   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3303 
3304   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3305   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3306 
3307   // -------------
3308   // make sure all code is generated
3309   masm->flush();
3310 
3311   // return the blob
3312   // (the frame size is passed in words)
3313   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3314 }
3315 
3316 //------------------------------Montgomery multiplication------------------------
3317 //
3318 
3319 #ifndef _WINDOWS
3320 
3321 // Subtract 0:b from carry:a.  Return carry.
3322 static julong
3323 sub(julong a[], julong b[], julong carry, long len) {
3324   long long i = 0, cnt = len;
3325   julong tmp;
3326   asm volatile("clc; "
3327                "0: ; "
3328                "mov (%[b], %[i], 8), %[tmp]; "
3329                "sbb %[tmp], (%[a], %[i], 8); "
3330                "inc %[i]; dec %[cnt]; "
3331                "jne 0b; "
3332                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3333                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3334                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3335                : "memory");
3336   return tmp;
3337 }
3338 
3339 // Multiply (unsigned) Long A by Long B, accumulating the double-
3340 // length result into the accumulator formed of T0, T1, and T2.
3341 #define MACC(A, B, T0, T1, T2)                                  \
3342 do {                                                            \
3343   unsigned long hi, lo;                                         \
3344   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3345            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3346            : "r"(A), "a"(B) : "cc");                            \
3347  } while(0)
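
// In effect (a C-ish sketch, not the implementation itself):
//   (T2:T1:T0) += (unsigned 128-bit)A * B
// i.e. the low 64 bits of the product are added into T0, the high 64 bits into
// T1, and the carries ripple up into T2.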
3348 
3349 // As above, but add twice the double-length result into the
3350 // accumulator.
3351 #define MACC2(A, B, T0, T1, T2)                                 \
3352 do {                                                            \
3353   unsigned long hi, lo;                                         \
3354   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3355            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3356            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3357            : "r"(A), "a"(B) : "cc");                            \
3358  } while(0)
3359 
3360 #else //_WINDOWS
3361 
3362 static julong
3363 sub(julong a[], julong b[], julong carry, long len) {
3364   long i;
3365   julong tmp;
3366   unsigned char c = 1;
3367   for (i = 0; i < len; i++) {
3368     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3369     a[i] = tmp;
3370   }
3371   c = _addcarry_u64(c, carry, ~0, &tmp);
3372   return tmp;
3373 }
3374 
3375 // Multiply (unsigned) Long A by Long B, accumulating the double-
3376 // length result into the accumulator formed of T0, T1, and T2.
3377 #define MACC(A, B, T0, T1, T2)                          \
3378 do {                                                    \
3379   julong hi, lo;                            \
3380   lo = _umul128(A, B, &hi);                             \
3381   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3382   c = _addcarry_u64(c, hi, T1, &T1);                    \
3383   _addcarry_u64(c, T2, 0, &T2);                         \
3384  } while(0)
3385 
3386 // As above, but add twice the double-length result into the
3387 // accumulator.
3388 #define MACC2(A, B, T0, T1, T2)                         \
3389 do {                                                    \
3390   julong hi, lo;                            \
3391   lo = _umul128(A, B, &hi);                             \
3392   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3393   c = _addcarry_u64(c, hi, T1, &T1);                    \
3394   _addcarry_u64(c, T2, 0, &T2);                         \
3395   c = _addcarry_u64(0, lo, T0, &T0);                    \
3396   c = _addcarry_u64(c, hi, T1, &T1);                    \
3397   _addcarry_u64(c, T2, 0, &T2);                         \
3398  } while(0)
3399 
3400 #endif //_WINDOWS
3401 
3402 // Fast Montgomery multiplication.  The derivation of the algorithm is
3403 // in  A Cryptographic Library for the Motorola DSP56000,
3404 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
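//
// In outline (a sketch of the math, not tied to this exact code): with
// R = 2^(64*len) and inv == -n[0]^-1 (mod 2^64), the routine computes a value
// m congruent to a * b * R^-1 (mod n). Each step chooses m[i] = t0 * inv so
// that adding m[i]*n[0] zeroes the low word of the accumulator (hence the
// assert that t0 == 0), which amounts to an exact shift right by 64 bits.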
3405 
3406 static void NOINLINE
3407 montgomery_multiply(julong a[], julong b[], julong n[],
3408                     julong m[], julong inv, int len) {
3409   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3410   int i;
3411 
3412   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3413 
3414   for (i = 0; i < len; i++) {
3415     int j;
3416     for (j = 0; j < i; j++) {
3417       MACC(a[j], b[i-j], t0, t1, t2);
3418       MACC(m[j], n[i-j], t0, t1, t2);
3419     }
3420     MACC(a[i], b[0], t0, t1, t2);
3421     m[i] = t0 * inv;
3422     MACC(m[i], n[0], t0, t1, t2);
3423 
3424     assert(t0 == 0, "broken Montgomery multiply");
3425 
3426     t0 = t1; t1 = t2; t2 = 0;
3427   }
3428 
3429   for (i = len; i < 2*len; i++) {
3430     int j;
3431     for (j = i-len+1; j < len; j++) {
3432       MACC(a[j], b[i-j], t0, t1, t2);
3433       MACC(m[j], n[i-j], t0, t1, t2);
3434     }
3435     m[i-len] = t0;
3436     t0 = t1; t1 = t2; t2 = 0;
3437   }
3438 
3439   while (t0)
3440     t0 = sub(m, n, t0, len);
3441 }
3442 
3443 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3444 // multiplies so it should be up to 25% faster than Montgomery
3445 // multiplication.  However, its loop control is more complex and it
3446 // may actually run slower on some machines.
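//
// (Sketch of why: cross terms a[j]*a[i-j] with j != i-j occur twice in the
// product, so each pair is folded into a single MACC2; only the diagonal term
// a[j]*a[j], reached when i is even, is accumulated once.)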
3447 
3448 static void NOINLINE
3449 montgomery_square(julong a[], julong n[],
3450                   julong m[], julong inv, int len) {
3451   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3452   int i;
3453 
3454   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3455 
3456   for (i = 0; i < len; i++) {
3457     int j;
3458     int end = (i+1)/2;
3459     for (j = 0; j < end; j++) {
3460       MACC2(a[j], a[i-j], t0, t1, t2);
3461       MACC(m[j], n[i-j], t0, t1, t2);
3462     }
3463     if ((i & 1) == 0) {
3464       MACC(a[j], a[j], t0, t1, t2);
3465     }
3466     for (; j < i; j++) {
3467       MACC(m[j], n[i-j], t0, t1, t2);
3468     }
3469     m[i] = t0 * inv;
3470     MACC(m[i], n[0], t0, t1, t2);
3471 
3472     assert(t0 == 0, "broken Montgomery square");
3473 
3474     t0 = t1; t1 = t2; t2 = 0;
3475   }
3476 
3477   for (i = len; i < 2*len; i++) {
3478     int start = i-len+1;
3479     int end = start + (len - start)/2;
3480     int j;
3481     for (j = start; j < end; j++) {
3482       MACC2(a[j], a[i-j], t0, t1, t2);
3483       MACC(m[j], n[i-j], t0, t1, t2);
3484     }
3485     if ((i & 1) == 0) {
3486       MACC(a[j], a[j], t0, t1, t2);
3487     }
3488     for (; j < len; j++) {
3489       MACC(m[j], n[i-j], t0, t1, t2);
3490     }
3491     m[i-len] = t0;
3492     t0 = t1; t1 = t2; t2 = 0;
3493   }
3494 
3495   while (t0)
3496     t0 = sub(m, n, t0, len);
3497 }
3498 
3499 // Swap words in a longword.
3500 static julong swap(julong x) {
3501   return (x << 32) | (x >> 32);
3502 }
3503 
3504 // Copy len longwords from s to d, word-swapping as we go.  The
3505 // destination array is reversed.
3506 static void reverse_words(julong *s, julong *d, int len) {
3507   d += len;
3508   while(len-- > 0) {
3509     d--;
3510     *d = swap(*s);
3511     s++;
3512   }
3513 }
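
// For example (a sketch), with len == 2 and
//   s = { 0xAAAAAAAABBBBBBBB, 0xCCCCCCCCDDDDDDDD }
// reverse_words(s, d, 2) produces
//   d = { 0xDDDDDDDDCCCCCCCC, 0xBBBBBBBBAAAAAAAA }
// which converts between the big-endian jint layout the callers pass in and
// the little-endian julong layout the Montgomery routines above expect.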
3514 
3515 // The threshold at which squaring is advantageous was determined
3516 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3517 #define MONTGOMERY_SQUARING_THRESHOLD 64
3518 
3519 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3520                                         jint len, jlong inv,
3521                                         jint *m_ints) {
3522   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3523   int longwords = len/2;
3524 
3525   // Make very sure we don't use so much space that the stack might
3526   // overflow.  512 jints corresponds to a 16384-bit integer and
3527   // will use a total of 8k bytes of stack space here.
3528   int divisor = sizeof(julong) * 4;
3529   guarantee(longwords <= 8192 / divisor, "must be");
3530   int total_allocation = longwords * sizeof (julong) * 4;
3531   julong *scratch = (julong *)alloca(total_allocation);
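  // For example, at the maximum of 512 jints: longwords == 256 and
  // total_allocation == 256 * 4 * sizeof(julong) == 8192 bytes, matching the
  // 8k figure above.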
3532 
3533   // Local scratch arrays
3534   julong
3535     *a = scratch + 0 * longwords,
3536     *b = scratch + 1 * longwords,
3537     *n = scratch + 2 * longwords,
3538     *m = scratch + 3 * longwords;
3539 
3540   reverse_words((julong *)a_ints, a, longwords);
3541   reverse_words((julong *)b_ints, b, longwords);
3542   reverse_words((julong *)n_ints, n, longwords);
3543 
3544   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3545 
3546   reverse_words(m, (julong *)m_ints, longwords);
3547 }
3548 
3549 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3550                                       jint len, jlong inv,
3551                                       jint *m_ints) {
3552   assert(len % 2 == 0, "array length in montgomery_square must be even");
3553   int longwords = len/2;
3554 
3555   // Make very sure we don't use so much space that the stack might
3556   // overflow.  512 jints corresponds to a 16384-bit integer and
3557   // will use a total of 6k bytes of stack space here.
3558   int divisor = sizeof(julong) * 3;
3559   guarantee(longwords <= (8192 / divisor), "must be");
3560   int total_allocation = longwords * sizeof (julong) * 3;
3561   julong *scratch = (julong *)alloca(total_allocation);
3562 
3563   // Local scratch arrays
3564   julong
3565     *a = scratch + 0 * longwords,
3566     *n = scratch + 1 * longwords,
3567     *m = scratch + 2 * longwords;
3568 
3569   reverse_words((julong *)a_ints, a, longwords);
3570   reverse_words((julong *)n_ints, n, longwords);
3571 
3572   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3573     ::montgomery_square(a, n, m, (julong)inv, longwords);
3574   } else {
3575     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3576   }
3577 
3578   reverse_words(m, (julong *)m_ints, longwords);
3579 }
3580 
3581 #ifdef COMPILER2
3582 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3583 //
3584 //------------------------------generate_exception_blob---------------------------
3585 // Creates the exception blob.
3586 // Compiled code jumps to this blob from the nmethod's exception handler stub
3587 // (see emit_exception_handler in the x86_64.ad file).
3588 //
3589 // Given an exception pc at a call we call into the runtime for the
3590 // handler in this method. This handler might merely restore state
3591 // (i.e. callee-saved registers), unwind the frame, and jump to the
3592 // exception handler for the nmethod if there is no Java-level handler
3593 // for the nmethod.
3594 //
3595 // This code is entered with a jmp.
3596 //
3597 // Arguments:
3598 //   rax: exception oop
3599 //   rdx: exception pc
3600 //
3601 // Results:
3602 //   rax: exception oop
3603 //   rdx: exception pc in caller or ???
3604 //   destination: exception handler of caller
3605 //
3606 // Note: the exception pc MUST be at a call (precise debug information)
3607 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3608 //
3609 
3610 void OptoRuntime::generate_exception_blob() {
3611   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3612   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3613   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3614 
3615   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3616 
3617   // Allocate space for the code
3618   ResourceMark rm;
3619   // Setup code generation tools
3620   CodeBuffer buffer("exception_blob", 2048, 1024);
3621   MacroAssembler* masm = new MacroAssembler(&buffer);
3622 
3623 
3624   address start = __ pc();
3625 
3626   // Exception pc is 'return address' for stack walker
3627   __ push(rdx);
3628   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3629 
3630   // Save callee-saved registers.  See x86_64.ad.
3631 
3632   // rbp is an implicitly saved callee saved register (i.e., the calling
3633   // convention will save/restore it in the prolog/epilog). Other than that
3634   // there are no callee save registers now that adapter frames are gone.
3635 
3636   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3637 
3638   // Store exception in Thread object. We cannot pass any arguments to the
3639   // handle_exception call, since we do not want to make any assumption
3640   // about the size of the frame in which the exception happened.
3641   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3642   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3643   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3644 
3645   // This call does all the hard work.  It checks if an exception handler
3646   // exists in the method.
3647   // If so, it returns the handler address.
3648   // If not, it prepares for stack-unwinding, restoring the callee-save
3649   // registers of the frame being removed.
3650   //
3651   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3652 
3653   // At a method handle call, the stack may not be properly aligned
3654   // when returning with an exception.
3655   address the_pc = __ pc();
3656   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3657   __ mov(c_rarg0, r15_thread);
3658   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3659   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3660 
3661   // Set an oopmap for the call site.  This oopmap will only be used if we
3662   // are unwinding the stack.  Hence, all locations will be dead.
3663   // Callee-saved registers will be the same as the frame above (i.e.,
3664   // handle_exception_stub), since they were restored when we got the
3665   // exception.
3666 
3667   OopMapSet* oop_maps = new OopMapSet();
3668 
3669   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3670 
3671   __ reset_last_Java_frame(false);
3672 
3673   // Restore callee-saved registers
3674 
3675   // rbp is an implicitly saved callee-saved register (i.e., the calling
3676   // convention will save/restore it in the prolog/epilog). Other than that
3677   // there are no callee save registers now that adapter frames are gone.
3678 
3679   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3680 
3681   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3682   __ pop(rdx);                  // No need for exception pc anymore
3683 
3684   // rax: exception handler
3685 
3686   // We have a handler in rax (could be deopt blob).
3687   __ mov(r8, rax);
3688 
3689   // Get the exception oop
3690   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3691   // Get the exception pc in case we are deoptimized
3692   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3693 #ifdef ASSERT
3694   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3695   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3696 #endif
3697   // Clear the exception oop so GC no longer processes it as a root.
3698   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3699 
3700   // rax: exception oop
3701   // r8:  exception handler
3702   // rdx: exception pc
3703   // Jump to handler
3704 
3705   __ jmp(r8);
3706 
3707   // Make sure all code is generated
3708   masm->flush();
3709 
3710   // Set exception blob
3711   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3712 }
3713 #endif // COMPILER2
3714