1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/SCCache.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "oops/method.inline.hpp"
  47 #include "prims/methodHandles.hpp"
  48 #include "runtime/continuation.hpp"
  49 #include "runtime/continuationEntry.inline.hpp"
  50 #include "runtime/globals.hpp"
  51 #include "runtime/jniHandles.hpp"
  52 #include "runtime/safepointMechanism.hpp"
  53 #include "runtime/sharedRuntime.hpp"
  54 #include "runtime/signature.hpp"
  55 #include "runtime/stubRoutines.hpp"
  56 #include "runtime/vframeArray.hpp"
  57 #include "runtime/vm_version.hpp"
  58 #include "utilities/align.hpp"
  59 #include "utilities/checkedCast.hpp"
  60 #include "utilities/formatBuffer.hpp"
  61 #include "vmreg_x86.inline.hpp"
  62 #ifdef COMPILER1
  63 #include "c1/c1_Runtime1.hpp"
  64 #endif
  65 #ifdef COMPILER2
  66 #include "opto/runtime.hpp"
  67 #endif
  68 #if INCLUDE_JVMCI
  69 #include "jvmci/jvmciJavaClasses.hpp"
  70 #endif
  71 
  72 #define __ masm->
  73 
  74 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  75 
  76 class SimpleRuntimeFrame {
  77 
  78   public:
  79 
  80   // Most of the runtime stubs have this simple frame layout.
  81   // This class exists to make the layout shared in one place.
  82   // Offsets are for compiler stack slots, which are jints.
  83   enum layout {
  84     // The frame sender code expects that rbp will be in the "natural" place and
  85     // will override any oopMap setting for it. We must therefore force the layout
  86     // so that it agrees with the frame sender code.
  87     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  88     rbp_off2,
  89     return_off, return_off2,
  90     framesize
  91   };
  92 };
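// Illustration (not generated code): on platforms where frame::arg_reg_save_area_bytes
// is 0 (everything except Win64) the enum above works out to
//
//   rbp_off    = 0, rbp_off2    = 1   // saved rbp (one 8-byte word = two jint slots)
//   return_off = 2, return_off2 = 3   // return address
//   framesize  = 4                    // 4 jint slots == 16 bytes == 2 words
//
// i.e. the stub frame is exactly what a plain enter() builds: saved rbp directly
// below the return address.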
  93 
  94 class RegisterSaver {
  95   // Capture info about frame layout.  Layout offsets are in jint
  96   // units because compiler frame slots are jints.
  97 #define XSAVE_AREA_BEGIN 160
  98 #define XSAVE_AREA_YMM_BEGIN 576
  99 #define XSAVE_AREA_OPMASK_BEGIN 1088
 100 #define XSAVE_AREA_ZMM_BEGIN 1152
 101 #define XSAVE_AREA_UPPERBANK 1664
 102 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 103 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 104 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 105 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 106 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
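// For reference, DEF_XMM_OFFS(1) expands to
//   xmm1_off = xmm_off + (1)*16/BytesPerInt, xmm1H_off
// i.e. each XMM register occupies 16 bytes (4 jint slots) in the fxsave image and the
// trailing ...H_off enumerator simply names the following slot. The YMM, ZMM-low,
// opmask and ZMM-upper-bank variants follow the same pattern with 16-, 32-, 8- and
// 64-byte strides respectively.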
 107   enum layout {
 108     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 109     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 110     DEF_XMM_OFFS(0),
 111     DEF_XMM_OFFS(1),
 112     // 2..15 are implied in range usage
 113     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 114     DEF_YMM_OFFS(0),
 115     DEF_YMM_OFFS(1),
 116     // 2..15 are implied in range usage
 117     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 118     DEF_OPMASK_OFFS(0),
 119     DEF_OPMASK_OFFS(1),
 120     // 2..7 are implied in range usage
 121     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 122     DEF_ZMM_OFFS(0),
 123     DEF_ZMM_OFFS(1),
 124     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 125     DEF_ZMM_UPPER_OFFS(16),
 126     DEF_ZMM_UPPER_OFFS(17),
 127     // 18..31 are implied in range usage
 128     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 129     fpu_stateH_end,
 130     r15_off, r15H_off,
 131     r14_off, r14H_off,
 132     r13_off, r13H_off,
 133     r12_off, r12H_off,
 134     r11_off, r11H_off,
 135     r10_off, r10H_off,
 136     r9_off,  r9H_off,
 137     r8_off,  r8H_off,
 138     rdi_off, rdiH_off,
 139     rsi_off, rsiH_off,
 140     ignore_off, ignoreH_off,  // extra copy of rbp
 141     rsp_off, rspH_off,
 142     rbx_off, rbxH_off,
 143     rdx_off, rdxH_off,
 144     rcx_off, rcxH_off,
 145     rax_off, raxH_off,
 146     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 147     align_off, alignH_off,
 148     flags_off, flagsH_off,
 149     // The frame sender code expects that rbp will be in the "natural" place and
 150     // will override any oopMap setting for it. We must therefore force the layout
 151     // so that it agrees with the frame sender code.
 152     rbp_off, rbpH_off,        // copy of rbp we will restore
 153     return_off, returnH_off,  // slot for return address
 154     reg_save_size             // size in compiler stack slots
 155   };
 156 
 157  public:
 158   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 159   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 160 
 161   // Offsets into the register save area
 162   // Used by deoptimization when it is managing result register
 163   // values on its own
 164 
 165   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 166   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 167   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 168   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 169   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 170 
 171   // During deoptimization only the result registers need to be restored,
 172   // all the other values have already been extracted.
 173   static void restore_result_registers(MacroAssembler* masm);
 174 };
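// Typical use by the safepoint/exception blobs generated later in this file
// (illustrative sketch only; names such as call_ptr and start stand in for the
// stub's own locals, and the real stubs also record last_Java_frame etc.):
//
//   int frame_size_in_words;
//   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
//   __ call(RuntimeAddress(call_ptr));             // call into the VM
//   oop_maps->add_gc_map(__ pc() - start, map);    // lets GC/deopt find every saved register
//   RegisterSaver::restore_live_registers(masm, save_wide_vectors);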
 175 
 176 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 177   int off = 0;
 178   int num_xmm_regs = XMMRegister::available_xmm_registers();
 179 #if COMPILER2_OR_JVMCI
 180   if (save_wide_vectors && UseAVX == 0) {
 181     save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 182   }
 183   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 184 #else
 185   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 186 #endif
 187 
 188   // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 189   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 190   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 191   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 192   // CodeBlob frame size is in words.
 193   int frame_size_in_words = frame_size_in_bytes / wordSize;
 194   *total_frame_words = frame_size_in_words;
 195 
 196   // Save registers, fpu state, and flags.
 197   // We assume caller has already pushed the return address onto the
 198   // stack, so rsp is 8-byte aligned here.
 199   // We push rbp twice in this sequence because we want the real rbp
 200   // to be under the return address like a normal enter.
 201 
 202   __ enter();          // rsp becomes 16-byte aligned here
 203   __ push_CPU_state(); // Push a multiple of 16 bytes
 204 
 205   // push_CPU_state already handles this on EVEX-enabled targets
 206   if (save_wide_vectors) {
 207     // Save upper half of YMM registers(0..15)
 208     int base_addr = XSAVE_AREA_YMM_BEGIN;
 209     for (int n = 0; n < 16; n++) {
 210       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 211     }
 212     if (VM_Version::supports_evex()) {
 213       // Save upper half of ZMM registers(0..15)
 214       base_addr = XSAVE_AREA_ZMM_BEGIN;
 215       for (int n = 0; n < 16; n++) {
 216         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 217       }
 218       // Save full ZMM registers(16..num_xmm_regs)
 219       base_addr = XSAVE_AREA_UPPERBANK;
 220       off = 0;
 221       int vector_len = Assembler::AVX_512bit;
 222       for (int n = 16; n < num_xmm_regs; n++) {
 223         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 224       }
 225 #if COMPILER2_OR_JVMCI
 226       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 227       off = 0;
 228       for(int n = 0; n < KRegister::number_of_registers; n++) {
 229         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 230       }
 231 #endif
 232     }
 233   } else {
 234     if (VM_Version::supports_evex()) {
 235       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 236       int base_addr = XSAVE_AREA_UPPERBANK;
 237       off = 0;
 238       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 239       for (int n = 16; n < num_xmm_regs; n++) {
 240         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 241       }
 242 #if COMPILER2_OR_JVMCI
 243       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 244       off = 0;
 245       for(int n = 0; n < KRegister::number_of_registers; n++) {
 246         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 247       }
 248 #endif
 249     }
 250   }
 251   __ vzeroupper();
 252   if (frame::arg_reg_save_area_bytes != 0) {
 253     // Allocate argument register save area
 254     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 255   }
 256 
 257   // Set an oopmap for the call site.  This oopmap will map all
 258   // oop-registers and debug-info registers as callee-saved.  This
 259   // will allow deoptimization at this safepoint to find all possible
 260   // debug-info recordings, as well as let GC find all oops.
 261 
 262   OopMapSet *oop_maps = new OopMapSet();
 263   OopMap* map = new OopMap(frame_size_in_slots, 0);
 264 
 265 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 266 
 267   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 271   // rbp location is known implicitly by the frame sender code, needs no oopmap
 272   // and the location where rbp was saved is ignored
 273   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 283   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 284   // on EVEX enabled targets, we get it included in the xsave area
 285   off = xmm0_off;
 286   int delta = xmm1_off - off;
 287   for (int n = 0; n < 16; n++) {
 288     XMMRegister xmm_name = as_XMMRegister(n);
 289     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 290     off += delta;
 291   }
 292   if (UseAVX > 2) {
 293     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 294     off = zmm16_off;
 295     delta = zmm17_off - off;
 296     for (int n = 16; n < num_xmm_regs; n++) {
 297       XMMRegister zmm_name = as_XMMRegister(n);
 298       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 299       off += delta;
 300     }
 301   }
 302 
 303 #if COMPILER2_OR_JVMCI
 304   if (save_wide_vectors) {
 305     // Save upper half of YMM registers(0..15)
 306     off = ymm0_off;
 307     delta = ymm1_off - ymm0_off;
 308     for (int n = 0; n < 16; n++) {
 309       XMMRegister ymm_name = as_XMMRegister(n);
 310       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 311       off += delta;
 312     }
 313     if (VM_Version::supports_evex()) {
 314       // Save upper half of ZMM registers(0..15)
 315       off = zmm0_off;
 316       delta = zmm1_off - zmm0_off;
 317       for (int n = 0; n < 16; n++) {
 318         XMMRegister zmm_name = as_XMMRegister(n);
 319         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 320         off += delta;
 321       }
 322     }
 323   }
 324 #endif // COMPILER2_OR_JVMCI
 325 
 326   // %%% These should all be a waste but we'll keep things as they were for now
 327   if (true) {
 328     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 332     // rbp location is known implicitly by the frame sender code, needs no oopmap
 333     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 343     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 344     // on EVEX enabled targets, we get it included in the xsave area
 345     off = xmm0H_off;
 346     delta = xmm1H_off - off;
 347     for (int n = 0; n < 16; n++) {
 348       XMMRegister xmm_name = as_XMMRegister(n);
 349       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 350       off += delta;
 351     }
 352     if (UseAVX > 2) {
 353       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 354       off = zmm16H_off;
 355       delta = zmm17H_off - off;
 356       for (int n = 16; n < num_xmm_regs; n++) {
 357         XMMRegister zmm_name = as_XMMRegister(n);
 358         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 359         off += delta;
 360       }
 361     }
 362   }
 363 
 364   return map;
 365 }
 366 
 367 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 368   int num_xmm_regs = XMMRegister::available_xmm_registers();
 369   if (frame::arg_reg_save_area_bytes != 0) {
 370     // Pop arg register save area
 371     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 372   }
 373 
 374 #if COMPILER2_OR_JVMCI
 375   if (restore_wide_vectors) {
 376     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 377     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 378   }
 379 #else
 380   assert(!restore_wide_vectors, "vectors are generated only by C2");
 381 #endif
 382 
 383   __ vzeroupper();
 384 
 385   // On EVEX enabled targets everything is handled in pop fpu state
 386   if (restore_wide_vectors) {
 387     // Restore upper half of YMM registers (0..15)
 388     int base_addr = XSAVE_AREA_YMM_BEGIN;
 389     for (int n = 0; n < 16; n++) {
 390       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 391     }
 392     if (VM_Version::supports_evex()) {
 393       // Restore upper half of ZMM registers (0..15)
 394       base_addr = XSAVE_AREA_ZMM_BEGIN;
 395       for (int n = 0; n < 16; n++) {
 396         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 397       }
 398       // Restore full ZMM registers(16..num_xmm_regs)
 399       base_addr = XSAVE_AREA_UPPERBANK;
 400       int vector_len = Assembler::AVX_512bit;
 401       int off = 0;
 402       for (int n = 16; n < num_xmm_regs; n++) {
 403         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 404       }
 405 #if COMPILER2_OR_JVMCI
 406       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 407       off = 0;
 408       for (int n = 0; n < KRegister::number_of_registers; n++) {
 409         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 410       }
 411 #endif
 412     }
 413   } else {
 414     if (VM_Version::supports_evex()) {
 415       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 416       int base_addr = XSAVE_AREA_UPPERBANK;
 417       int off = 0;
 418       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 419       for (int n = 16; n < num_xmm_regs; n++) {
 420         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 421       }
 422 #if COMPILER2_OR_JVMCI
 423       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 424       off = 0;
 425       for (int n = 0; n < KRegister::number_of_registers; n++) {
 426         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 427       }
 428 #endif
 429     }
 430   }
 431 
 432   // Recover CPU state
 433   __ pop_CPU_state();
 434   // Get the rbp described implicitly by the calling convention (no oopMap)
 435   __ pop(rbp);
 436 }
 437 
 438 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 439 
 440   // Just restore the result registers. Only used by deoptimization. By
 441   // now any callee save register that needs to be restored to a c2
 442   // caller of the deoptee has been extracted into the vframeArray
 443   // and will be stuffed into the c2i adapter we create for later
 444   // restoration so only result registers need to be restored here.
 445 
 446   // Restore fp result register
 447   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 448   // Restore integer result register
 449   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 450   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 451 
 452   // Pop all of the register save area off the stack except the return address
 453   __ addptr(rsp, return_offset_in_bytes());
 454 }
 455 
 456 // Is the vector's size (in bytes) bigger than the size saved by default?
 457 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 458 bool SharedRuntime::is_wide_vector(int size) {
 459   return size > 16;
 460 }
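// For example, a 16-byte (128-bit) XMM vector is not considered wide (returns false),
// while 32-byte YMM and 64-byte ZMM vectors are (returns true).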
 461 
 462 // ---------------------------------------------------------------------------
 463 // Read the array of BasicTypes from a signature, and compute where the
 464 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 465 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 466 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 467 // as framesizes are fixed.
 468 // VMRegImpl::stack0 refers to the first slot at 0(sp),
 469 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
 470 // Registers up to Register::number_of_registers are the 64-bit
 471 // integer registers.
 472 
 473 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 474 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 475 // units regardless of build. Of course for i486 there is no 64-bit build.
 476 
 477 // The Java calling convention is a "shifted" version of the C ABI.
 478 // By skipping the first C ABI register we can call non-static JNI methods
 479 // with small numbers of arguments without having to shuffle the arguments
 480 // at all. Since we control the Java ABI we ought to at least get some
 481 // advantage out of it.
 482 
 483 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 484                                            VMRegPair *regs,
 485                                            int total_args_passed) {
 486 
 487   // Create the mapping between argument positions and
 488   // registers.
 489   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 490     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 491   };
 492   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 493     j_farg0, j_farg1, j_farg2, j_farg3,
 494     j_farg4, j_farg5, j_farg6, j_farg7
 495   };
 496 
 497 
 498   uint int_args = 0;
 499   uint fp_args = 0;
 500   uint stk_args = 0;
 501 
 502   for (int i = 0; i < total_args_passed; i++) {
 503     switch (sig_bt[i]) {
 504     case T_BOOLEAN:
 505     case T_CHAR:
 506     case T_BYTE:
 507     case T_SHORT:
 508     case T_INT:
 509       if (int_args < Argument::n_int_register_parameters_j) {
 510         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 511       } else {
 512         stk_args = align_up(stk_args, 2);
 513         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 514         stk_args += 1;
 515       }
 516       break;
 517     case T_VOID:
 518       // halves of T_LONG or T_DOUBLE
 519       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 520       regs[i].set_bad();
 521       break;
 522     case T_LONG:
 523       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 524       // fall through
 525     case T_OBJECT:
 526     case T_ARRAY:
 527     case T_ADDRESS:
 528       if (int_args < Argument::n_int_register_parameters_j) {
 529         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 530       } else {
 531         stk_args = align_up(stk_args, 2);
 532         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 533         stk_args += 2;
 534       }
 535       break;
 536     case T_FLOAT:
 537       if (fp_args < Argument::n_float_register_parameters_j) {
 538         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 539       } else {
 540         stk_args = align_up(stk_args, 2);
 541         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 542         stk_args += 1;
 543       }
 544       break;
 545     case T_DOUBLE:
 546       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 547       if (fp_args < Argument::n_float_register_parameters_j) {
 548         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 549       } else {
 550         stk_args = align_up(stk_args, 2);
 551         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 552         stk_args += 2;
 553       }
 554       break;
 555     default:
 556       ShouldNotReachHere();
 557       break;
 558     }
 559   }
 560 
 561   return stk_args;
 562 }
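// Worked example (illustrative only, not generated code): for a virtual call
// with the signature (Object receiver, int a, long b, float c), sig_bt is
// { T_OBJECT, T_INT, T_LONG, T_VOID, T_FLOAT } and the loop above yields
//
//   regs[0].set2(j_rarg0)   // receiver
//   regs[1].set1(j_rarg1)   // a
//   regs[2].set2(j_rarg2)   // b
//   regs[3]   BAD           // high half of b
//   regs[4].set1(j_farg0)   // c
//
// and returns 0 because nothing spilled to the stack; once the six integer
// argument registers are exhausted, further integer/oop arguments receive
// 2-aligned stack slots via VMRegImpl::stack2reg().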
 563 
 564 // Patch the caller's callsite with entry to compiled code if it exists.
 565 static void patch_callers_callsite(MacroAssembler *masm) {
 566   Label L;
 567   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 568   __ jcc(Assembler::equal, L);
 569 
 570   // Save the current stack pointer
 571   __ mov(r13, rsp);
 572   // Schedule the branch target address early.
 573   // Call into the VM to patch the caller, then jump to compiled callee
 574   // rax isn't live so capture return address while we easily can
 575   __ movptr(rax, Address(rsp, 0));
 576 
 577   // align stack so push_CPU_state doesn't fault
 578   __ andptr(rsp, -(StackAlignmentInBytes));
 579   __ push_CPU_state();
 580   __ vzeroupper();
 581   // VM needs caller's callsite
 582   // VM needs target method
 583   // This needs to be a long call since we will relocate this adapter to
 584   // the codeBuffer and it may not reach
 585 
 586   // Allocate argument register save area
 587   if (frame::arg_reg_save_area_bytes != 0) {
 588     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 589   }
 590   __ mov(c_rarg0, rbx);
 591   __ mov(c_rarg1, rax);
 592   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 593 
 594   // De-allocate argument register save area
 595   if (frame::arg_reg_save_area_bytes != 0) {
 596     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 597   }
 598 
 599   __ vzeroupper();
 600   __ pop_CPU_state();
 601   // restore sp
 602   __ mov(rsp, r13);
 603   __ bind(L);
 604 }
 605 
 606 
 607 static void gen_c2i_adapter(MacroAssembler *masm,
 608                             int total_args_passed,
 609                             int comp_args_on_stack,
 610                             const BasicType *sig_bt,
 611                             const VMRegPair *regs,
 612                             Label& skip_fixup) {
 613   // Before we get into the guts of the C2I adapter, see if we should be here
 614   // at all.  We've come from compiled code and are attempting to jump to the
 615   // interpreter, which means the caller made a static call to get here
 616   // (vcalls always get a compiled target if there is one).  Check for a
 617   // compiled target.  If there is one, we need to patch the caller's call.
 618   patch_callers_callsite(masm);
 619 
 620   __ bind(skip_fixup);
 621 
 622   // Since all args are passed on the stack, total_args_passed *
 623   // Interpreter::stackElementSize is the space we need.
 624 
 625   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 626 
 627   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 628 
 629   // stack is aligned, keep it that way
 630   // This is not currently needed or enforced by the interpreter, but
 631   // we might as well conform to the ABI.
 632   extraspace = align_up(extraspace, 2*wordSize);
 633 
 634   // set senderSP value
 635   __ lea(r13, Address(rsp, wordSize));
 636 
 637 #ifdef ASSERT
 638   __ check_stack_alignment(r13, "sender stack not aligned");
 639 #endif
 640   if (extraspace > 0) {
 641     // Pop the return address
 642     __ pop(rax);
 643 
 644     __ subptr(rsp, extraspace);
 645 
 646     // Push the return address
 647     __ push(rax);
 648 
 649     // Account for the return address location since we store it first rather
 650     // than hold it in a register across all the shuffling
 651     extraspace += wordSize;
 652   }
 653 
 654 #ifdef ASSERT
 655   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 656 #endif
 657 
 658   // Now write the args into the outgoing interpreter space
 659   for (int i = 0; i < total_args_passed; i++) {
 660     if (sig_bt[i] == T_VOID) {
 661       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 662       continue;
 663     }
 664 
 665     // offset to start parameters
 666     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 667     int next_off = st_off - Interpreter::stackElementSize;
 668 
 669     // Say 4 args:
 670     // i   st_off
 671     // 0   32 T_LONG
 672     // 1   24 T_VOID
 673     // 2   16 T_OBJECT
 674     // 3    8 T_BOOL
 675     // -    0 return address
 676     //
 677     // However, to make things extra confusing: because we can fit a long/double in
 678     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 679     // leaves one slot empty and only stores to a single slot. In this case the
 680     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 681 
 682     VMReg r_1 = regs[i].first();
 683     VMReg r_2 = regs[i].second();
 684     if (!r_1->is_valid()) {
 685       assert(!r_2->is_valid(), "");
 686       continue;
 687     }
 688     if (r_1->is_stack()) {
 689       // memory to memory use rax
 690       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 691       if (!r_2->is_valid()) {
 692         // sign extend??
 693         __ movl(rax, Address(rsp, ld_off));
 694         __ movptr(Address(rsp, st_off), rax);
 695 
 696       } else {
 697 
 698         __ movq(rax, Address(rsp, ld_off));
 699 
 700         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 701         // T_DOUBLE and T_LONG use two slots in the interpreter
 702         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 703           // ld_off == LSW, ld_off+wordSize == MSW
 704           // st_off == MSW, next_off == LSW
 705           __ movq(Address(rsp, next_off), rax);
 706 #ifdef ASSERT
 707           // Overwrite the unused slot with known junk
 708           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 709           __ movptr(Address(rsp, st_off), rax);
 710 #endif /* ASSERT */
 711         } else {
 712           __ movq(Address(rsp, st_off), rax);
 713         }
 714       }
 715     } else if (r_1->is_Register()) {
 716       Register r = r_1->as_Register();
 717       if (!r_2->is_valid()) {
 718         // must be only an int (or less) so move only 32 bits to the slot
 719         // why not sign extend??
 720         __ movl(Address(rsp, st_off), r);
 721       } else {
 722         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 723         // T_DOUBLE and T_LONG use two slots in the interpreter
 724         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 725           // long/double in gpr
 726 #ifdef ASSERT
 727           // Overwrite the unused slot with known junk
 728           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 729           __ movptr(Address(rsp, st_off), rax);
 730 #endif /* ASSERT */
 731           __ movq(Address(rsp, next_off), r);
 732         } else {
 733           __ movptr(Address(rsp, st_off), r);
 734         }
 735       }
 736     } else {
 737       assert(r_1->is_XMMRegister(), "");
 738       if (!r_2->is_valid()) {
 739         // only a float; use just part of the slot
 740         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 741       } else {
 742 #ifdef ASSERT
 743         // Overwrite the unused slot with known junk
 744         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 745         __ movptr(Address(rsp, st_off), rax);
 746 #endif /* ASSERT */
 747         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 748       }
 749     }
 750   }
 751 
 752   // Schedule the branch target address early.
 753   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 754   __ jmp(rcx);
 755 }
 756 
 757 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 758                         address code_start, address code_end,
 759                         Label& L_ok) {
 760   Label L_fail;
 761   __ lea(temp_reg, ExternalAddress(code_start));
 762   __ cmpptr(pc_reg, temp_reg);
 763   __ jcc(Assembler::belowEqual, L_fail);
 764   __ lea(temp_reg, ExternalAddress(code_end));
 765   __ cmpptr(pc_reg, temp_reg);
 766   __ jcc(Assembler::below, L_ok);
 767   __ bind(L_fail);
 768 }
 769 
 770 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 771                                     int total_args_passed,
 772                                     int comp_args_on_stack,
 773                                     const BasicType *sig_bt,
 774                                     const VMRegPair *regs) {
 775 
 776   // Note: r13 contains the senderSP on entry. We must preserve it since
 777   // we may do an i2c -> c2i transition if we lose a race where compiled
 778   // code goes non-entrant while we get args ready.
 779   // In addition we use r13 to locate all the interpreter args because
 780   // we must align the stack to 16 bytes on an i2c entry; otherwise we
 781   // lose the alignment we expect in all compiled code, and the register-
 782   // save code can segv when fxsave instructions find an improperly
 783   // aligned stack pointer.
 784 
 785   // Adapters can be frameless because they do not require the caller
 786   // to perform additional cleanup work, such as correcting the stack pointer.
 787   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 788   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 789   // even if a callee has modified the stack pointer.
 790   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 791   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 792   // up via the senderSP register).
 793   // In other words, if *either* the caller or callee is interpreted, we can
 794   // get the stack pointer repaired after a call.
 795   // This is why c2i and i2c adapters cannot be indefinitely composed.
 796   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 797   // both caller and callee would be compiled methods, and neither would
 798   // clean up the stack pointer changes performed by the two adapters.
 799   // If this happens, control eventually transfers back to the compiled
 800   // caller, but with an uncorrected stack, causing delayed havoc.
 801 
 802   if (VerifyAdapterCalls &&
 803       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 804     // So, let's test for cascading c2i/i2c adapters right now.
 805     //  assert(Interpreter::contains($return_addr) ||
 806     //         StubRoutines::contains($return_addr),
 807     //         "i2c adapter must return to an interpreter frame");
 808     __ block_comment("verify_i2c { ");
 809     // Pick up the return address
 810     __ movptr(rax, Address(rsp, 0));
 811     Label L_ok;
 812     if (Interpreter::code() != nullptr) {
 813       range_check(masm, rax, r11,
 814                   Interpreter::code()->code_start(),
 815                   Interpreter::code()->code_end(),
 816                   L_ok);
 817     }
 818     if (StubRoutines::initial_stubs_code() != nullptr) {
 819       range_check(masm, rax, r11,
 820                   StubRoutines::initial_stubs_code()->code_begin(),
 821                   StubRoutines::initial_stubs_code()->code_end(),
 822                   L_ok);
 823     }
 824     if (StubRoutines::final_stubs_code() != nullptr) {
 825       range_check(masm, rax, r11,
 826                   StubRoutines::final_stubs_code()->code_begin(),
 827                   StubRoutines::final_stubs_code()->code_end(),
 828                   L_ok);
 829     }
 830     const char* msg = "i2c adapter must return to an interpreter frame";
 831     __ block_comment(msg);
 832     __ stop(msg);
 833     __ bind(L_ok);
 834     __ block_comment("} verify_i2c ");
 835   }
 836 
 837   // Must preserve original SP for loading incoming arguments because
 838   // we need to align the outgoing SP for compiled code.
 839   __ movptr(r11, rsp);
 840 
 841   // Pick up the return address
 842   __ pop(rax);
 843 
 844   // Convert 4-byte c2 stack slots to words.
 845   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 846 
 847   if (comp_args_on_stack) {
 848     __ subptr(rsp, comp_words_on_stack * wordSize);
 849   }
 850 
 851   // Ensure compiled code always sees stack at proper alignment
 852   __ andptr(rsp, -16);
 853 
 854   // Push the return address, misaligning the stack so that the youngest frame
 855   // always sees it where the placement of a call instruction would leave it.
 856   __ push(rax);
 857 
 858   // Put saved SP in another register
 859   const Register saved_sp = rax;
 860   __ movptr(saved_sp, r11);
 861 
 862   // Will jump to the compiled code just as if compiled code was doing it.
 863   // Pre-load the register-jump target early, to schedule it better.
 864   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 865 
 866 #if INCLUDE_JVMCI
 867   if (EnableJVMCI) {
 868     // check if this call should be routed towards a specific entry point
 869     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 870     Label no_alternative_target;
 871     __ jcc(Assembler::equal, no_alternative_target);
 872     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 873     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 874     __ bind(no_alternative_target);
 875   }
 876 #endif // INCLUDE_JVMCI
 877 
 878   // Now generate the shuffle code.  Pick up all register args and move the
 879   // rest through the floating point stack top.
 880   for (int i = 0; i < total_args_passed; i++) {
 881     if (sig_bt[i] == T_VOID) {
 882       // Longs and doubles are passed in native word order, but misaligned
 883       // in the 32-bit build.
 884       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 885       continue;
 886     }
 887 
 888     // Pick up 0, 1 or 2 words from SP+offset.
 889 
 890     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 891             "scrambled load targets?");
 892     // Load in argument order going down.
 893     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 894     // Point to interpreter value (vs. tag)
 895     int next_off = ld_off - Interpreter::stackElementSize;
 896     //
 897     //
 898     //
 899     VMReg r_1 = regs[i].first();
 900     VMReg r_2 = regs[i].second();
 901     if (!r_1->is_valid()) {
 902       assert(!r_2->is_valid(), "");
 903       continue;
 904     }
 905     if (r_1->is_stack()) {
 906       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 907       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 908 
 909       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 910       // and if we end up going through a c2i because of a miss, a reasonable value of r13
 911       // will be generated.
 912       if (!r_2->is_valid()) {
 913         // sign extend???
 914         __ movl(r13, Address(saved_sp, ld_off));
 915         __ movptr(Address(rsp, st_off), r13);
 916       } else {
 917         //
 918         // We are using two OptoRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
 919         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
 920         // So we must adjust where to pick up the data to match the interpreter.
 921         //
 922         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
 923         // are accessed with negative offsets, so the LSW is at the LOW address.
 924 
 925         // ld_off is MSW so get LSW
 926         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 927                            next_off : ld_off;
 928         __ movq(r13, Address(saved_sp, offset));
 929         // st_off is LSW (i.e. reg.first())
 930         __ movq(Address(rsp, st_off), r13);
 931       }
 932     } else if (r_1->is_Register()) {  // Register argument
 933       Register r = r_1->as_Register();
 934       assert(r != rax, "must be different");
 935       if (r_2->is_valid()) {
 936         //
 937         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 938         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
 939         // So we must adjust where to pick up the data to match the interpreter.
 940 
 941         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 942                            next_off : ld_off;
 943 
 944         // this can be a misaligned move
 945         __ movq(r, Address(saved_sp, offset));
 946       } else {
 947         // sign extend and use a full word?
 948         __ movl(r, Address(saved_sp, ld_off));
 949       }
 950     } else {
 951       if (!r_2->is_valid()) {
 952         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 953       } else {
 954         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 955       }
 956     }
 957   }
 958 
 959   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 960 
 961   // 6243940 We might end up in handle_wrong_method if
 962   // the callee is deoptimized as we race through here. If that
 963   // happens we don't want to take a safepoint because the
 964   // caller frame will look interpreted and arguments are now
 965   // "compiled" so it is much better to make this transition
 966   // invisible to the stack walking code. Unfortunately if
 967   // we try and find the callee by normal means a safepoint
 968   // is possible. So we stash the desired callee in the thread
 969   // and the VM will find it there should this case occur.
 970 
 971   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 972 
 973   // Put the Method* where a c2i adapter would expect it should we end up there;
 974   // only needed because c2's resolve stubs return the Method* as a result in
 975   // rax.
 976   __ mov(rax, rbx);
 977   __ jmp(r11);
 978 }
 979 
 980 // ---------------------------------------------------------------
 981 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 982                                                             int total_args_passed,
 983                                                             int comp_args_on_stack,
 984                                                             const BasicType *sig_bt,
 985                                                             const VMRegPair *regs,
 986                                                             AdapterFingerPrint* fingerprint) {
 987   address i2c_entry = __ pc();
 988 
 989   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 990 
 991   // -------------------------------------------------------------------------
 992   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 993   // to the interpreter.  The args start out packed in the compiled layout.  They
 994   // need to be unpacked into the interpreter layout.  This will almost always
 995   // require some stack space.  We grow the current (compiled) stack, then repack
 996   // the args.  We  finally end in a jump to the generic interpreter entry point.
 997   // On exit from the interpreter, the interpreter will restore our SP (lest the
 998   // compiled code, which relies solely on SP and not RBP, get sick).
 999 
1000   address c2i_unverified_entry = __ pc();
1001   Label skip_fixup;
1002 
1003   Register data = rax;
1004   Register receiver = j_rarg0;
1005   Register temp = rbx;
1006 
1007   {
1008     __ ic_check(1 /* end_alignment */);
1009     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1010     // Method might have been compiled since the call site was patched to
1011     // interpreted; if that is the case, treat it as a miss so we can get
1012     // the call site corrected.
1013     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1014     __ jcc(Assembler::equal, skip_fixup);
1015     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1016   }
1017 
1018   address c2i_entry = __ pc();
1019 
1020   // Class initialization barrier for static methods
1021   address c2i_no_clinit_check_entry = nullptr;
1022   if (VM_Version::supports_fast_class_init_checks()) {
1023     Label L_skip_barrier;
1024     Register method = rbx;
1025 
1026     { // Bypass the barrier for non-static methods
1027       Register flags = rscratch1;
1028       __ movl(flags, Address(method, Method::access_flags_offset()));
1029       __ testl(flags, JVM_ACC_STATIC);
1030       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1031     }
1032 
1033     Register klass = rscratch1;
1034     __ load_method_holder(klass, method);
1035     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1036 
1037     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1038 
1039     __ bind(L_skip_barrier);
1040     c2i_no_clinit_check_entry = __ pc();
1041   }
1042 
1043   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1044   bs->c2i_entry_barrier(masm);
1045 
1046   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1047 
1048   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1049 }
1050 
1051 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1052                                          VMRegPair *regs,
1053                                          int total_args_passed) {
1054 
1055 // We return the number of VMRegImpl stack slots we need to reserve for all
1056 // the arguments NOT counting out_preserve_stack_slots.
1057 
1058 // NOTE: These arrays will have to change when c1 is ported
1059 #ifdef _WIN64
1060     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1061       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1062     };
1063     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1064       c_farg0, c_farg1, c_farg2, c_farg3
1065     };
1066 #else
1067     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1068       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1069     };
1070     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1071       c_farg0, c_farg1, c_farg2, c_farg3,
1072       c_farg4, c_farg5, c_farg6, c_farg7
1073     };
1074 #endif // _WIN64
1075 
1076 
1077     uint int_args = 0;
1078     uint fp_args = 0;
1079     uint stk_args = 0; // inc by 2 each time
1080 
1081     for (int i = 0; i < total_args_passed; i++) {
1082       switch (sig_bt[i]) {
1083       case T_BOOLEAN:
1084       case T_CHAR:
1085       case T_BYTE:
1086       case T_SHORT:
1087       case T_INT:
1088         if (int_args < Argument::n_int_register_parameters_c) {
1089           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1090 #ifdef _WIN64
1091           fp_args++;
1092           // Allocate slots for callee to stuff register args on the stack.
1093           stk_args += 2;
1094 #endif
1095         } else {
1096           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1097           stk_args += 2;
1098         }
1099         break;
1100       case T_LONG:
1101         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1102         // fall through
1103       case T_OBJECT:
1104       case T_ARRAY:
1105       case T_ADDRESS:
1106       case T_METADATA:
1107         if (int_args < Argument::n_int_register_parameters_c) {
1108           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1109 #ifdef _WIN64
1110           fp_args++;
1111           stk_args += 2;
1112 #endif
1113         } else {
1114           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1115           stk_args += 2;
1116         }
1117         break;
1118       case T_FLOAT:
1119         if (fp_args < Argument::n_float_register_parameters_c) {
1120           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1121 #ifdef _WIN64
1122           int_args++;
1123           // Allocate slots for callee to stuff register args on the stack.
1124           stk_args += 2;
1125 #endif
1126         } else {
1127           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1128           stk_args += 2;
1129         }
1130         break;
1131       case T_DOUBLE:
1132         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1133         if (fp_args < Argument::n_float_register_parameters_c) {
1134           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1135 #ifdef _WIN64
1136           int_args++;
1137           // Allocate slots for callee to stuff register args on the stack.
1138           stk_args += 2;
1139 #endif
1140         } else {
1141           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1142           stk_args += 2;
1143         }
1144         break;
1145       case T_VOID: // Halves of longs and doubles
1146         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1147         regs[i].set_bad();
1148         break;
1149       default:
1150         ShouldNotReachHere();
1151         break;
1152       }
1153     }
1154 #ifdef _WIN64
1155   // The Windows ABI requires that we always allocate enough stack space
1156   // for 4 64-bit registers to be stored down.
1157   if (stk_args < 8) {
1158     stk_args = 8;
1159   }
1160 #endif // _WIN64
1161 
1162   return stk_args;
1163 }
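// Worked example (illustrative only): for sig_bt = { T_ADDRESS, T_DOUBLE, T_VOID, T_INT }
// the loop above assigns
//
//   non-Win64 (System V): c_rarg0, c_farg0, BAD, c_rarg1 and returns 0,
//     since integer and FP registers are allocated independently and nothing spills.
//
//   Win64:                c_rarg0, c_farg1, BAD, c_rarg2 and returns 8,
//     since every argument consumes one register *position* (note the double
//     landing in the second FP register) and the mandatory 32-byte shadow area
//     means at least 8 stack slots are always reported.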
1164 
1165 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1166                                              uint num_bits,
1167                                              uint total_args_passed) {
1168   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1169          "only certain vector sizes are supported for now");
1170 
1171   static const XMMRegister VEC_ArgReg[32] = {
1172      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1173      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1174     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1175     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1176   };
1177 
1178   uint stk_args = 0;
1179   uint fp_args = 0;
1180 
1181   for (uint i = 0; i < total_args_passed; i++) {
1182     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1183     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1184     regs[i].set_pair(vmreg->next(next_val), vmreg);
1185   }
1186 
1187   return stk_args;
1188 }
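// Example (illustrative): with num_bits == 256 and total_args_passed == 2 the
// loop above produces
//
//   regs[0] = { xmm0->next(7), xmm0 }   // 8 x 32-bit VMReg slots == 256 bits
//   regs[1] = { xmm1->next(7), xmm1 }
//
// Vector arguments are always passed in registers here, so the returned
// stack-slot count is 0.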
1189 
1190 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1191   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1192   // which by this time is free to use
1193   switch (ret_type) {
1194   case T_FLOAT:
1195     __ movflt(Address(rbp, -wordSize), xmm0);
1196     break;
1197   case T_DOUBLE:
1198     __ movdbl(Address(rbp, -wordSize), xmm0);
1199     break;
1200   case T_VOID:  break;
1201   default: {
1202     __ movptr(Address(rbp, -wordSize), rax);
1203     }
1204   }
1205 }
1206 
1207 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1208   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1209   // which by this time is free to use
1210   switch (ret_type) {
1211   case T_FLOAT:
1212     __ movflt(xmm0, Address(rbp, -wordSize));
1213     break;
1214   case T_DOUBLE:
1215     __ movdbl(xmm0, Address(rbp, -wordSize));
1216     break;
1217   case T_VOID:  break;
1218   default: {
1219     __ movptr(rax, Address(rbp, -wordSize));
1220     }
1221   }
1222 }
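// These two are used as a bracket in the native wrapper whenever the result of
// the native call must survive a VM call that may clobber rax/xmm0, roughly
// (illustrative sketch only):
//
//   save_native_result(masm, ret_type, stack_slots);     // park rax / xmm0 below rbp
//   ... emit a runtime call that may clobber the result registers ...
//   restore_native_result(masm, ret_type, stack_slots);  // bring the result back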
1223 
1224 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1225     for ( int i = first_arg ; i < arg_count ; i++ ) {
1226       if (args[i].first()->is_Register()) {
1227         __ push(args[i].first()->as_Register());
1228       } else if (args[i].first()->is_XMMRegister()) {
1229         __ subptr(rsp, 2*wordSize);
1230         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1231       }
1232     }
1233 }
1234 
1235 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1236     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1237       if (args[i].first()->is_Register()) {
1238         __ pop(args[i].first()->as_Register());
1239       } else if (args[i].first()->is_XMMRegister()) {
1240         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1241         __ addptr(rsp, 2*wordSize);
1242       }
1243     }
1244 }
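// save_args/restore_args likewise bracket code in the native wrapper that must
// not disturb the outgoing C arguments, e.g. (illustrative):
//
//   save_args(masm, total_c_args, c_arg, out_regs);
//   ... emit a call that clobbers argument registers ...
//   restore_args(masm, total_c_args, c_arg, out_regs);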
1245 
1246 static void verify_oop_args(MacroAssembler* masm,
1247                             const methodHandle& method,
1248                             const BasicType* sig_bt,
1249                             const VMRegPair* regs) {
1250   Register temp_reg = rbx;  // not part of any compiled calling seq
1251   if (VerifyOops) {
1252     for (int i = 0; i < method->size_of_parameters(); i++) {
1253       if (is_reference_type(sig_bt[i])) {
1254         VMReg r = regs[i].first();
1255         assert(r->is_valid(), "bad oop arg");
1256         if (r->is_stack()) {
1257           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1258           __ verify_oop(temp_reg);
1259         } else {
1260           __ verify_oop(r->as_Register());
1261         }
1262       }
1263     }
1264   }
1265 }
1266 
1267 static void check_continuation_enter_argument(VMReg actual_vmreg,
1268                                               Register expected_reg,
1269                                               const char* name) {
1270   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1271   assert(actual_vmreg->as_Register() == expected_reg,
1272          "%s is in unexpected register: %s instead of %s",
1273          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1274 }
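// Used by gen_continuation_enter() below to verify that the incoming Java arguments
// really are in the registers the hand-written enter code assumes, along the lines
// of (illustrative, argument names hypothetical):
//
//   check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object");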
1275 
1276 
1277 //---------------------------- continuation_enter_setup ---------------------------
1278 //
1279 // Arguments:
1280 //   None.
1281 //
1282 // Results:
1283 //   rsp: pointer to blank ContinuationEntry
1284 //
1285 // Kills:
1286 //   rax
1287 //
1288 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1289   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1290   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1291   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1292 
1293   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1294   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1295 
1296   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1297   OopMap* map = new OopMap(frame_size, 0);
1298 
1299   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1300   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1301   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1302 
1303   return map;
1304 }
1305 
1306 //---------------------------- fill_continuation_entry ---------------------------
1307 //
1308 // Arguments:
//   rsp: pointer to blank ContinuationEntry
1310 //   reg_cont_obj: pointer to the continuation
1311 //   reg_flags: flags
1312 //
1313 // Results:
1314 //   rsp: pointer to filled out ContinuationEntry
1315 //
1316 // Kills:
1317 //   rax
1318 //
1319 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1320   assert_different_registers(rax, reg_cont_obj, reg_flags);
1321 #ifdef ASSERT
1322   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1323 #endif
1324   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1325   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1326   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1327   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1328   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1329 
1330   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1331   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1332   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1333   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1334 
1335   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1336   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1337 }
1338 
1339 //---------------------------- continuation_enter_cleanup ---------------------------
1340 //
1341 // Arguments:
1342 //   rsp: pointer to the ContinuationEntry
1343 //
1344 // Results:
1345 //   rsp: pointer to the spilled rbp in the entry frame
1346 //
1347 // Kills:
1348 //   rbx
1349 //
static void continuation_enter_cleanup(MacroAssembler* masm) {
1351 #ifdef ASSERT
1352   Label L_good_sp;
1353   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1354   __ jcc(Assembler::equal, L_good_sp);
1355   __ stop("Incorrect rsp at continuation_enter_cleanup");
1356   __ bind(L_good_sp);
1357 #endif
1358 
1359   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1360   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1361   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1362   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1363 
1364   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1365   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1366   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1367 }
1368 
1369 static void gen_continuation_enter(MacroAssembler* masm,
1370                                    const VMRegPair* regs,
1371                                    int& exception_offset,
1372                                    OopMapSet* oop_maps,
1373                                    int& frame_complete,
1374                                    int& stack_slots,
1375                                    int& interpreted_entry_offset,
1376                                    int& compiled_entry_offset) {
1377 
1378   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1379   int pos_cont_obj   = 0;
1380   int pos_is_cont    = 1;
1381   int pos_is_virtual = 2;
1382 
  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside in these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
1387   Register reg_cont_obj   = c_rarg1;
1388   Register reg_is_cont    = c_rarg2;
1389   Register reg_is_virtual = c_rarg3;
1390 
1391   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1392   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1393   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1394 
1395   // Utility methods kill rax, make sure there are no collisions
1396   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1397 
1398   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1399                          relocInfo::static_call_type);
1400 
1401   address start = __ pc();
1402 
1403   Label L_thaw, L_exit;
1404 
  // i2i entry, used only in interp_only_mode
1406   interpreted_entry_offset = __ pc() - start;
1407   {
1408 #ifdef ASSERT
1409     Label is_interp_only;
1410     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1411     __ jcc(Assembler::notEqual, is_interp_only);
1412     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1413     __ bind(is_interp_only);
1414 #endif
1415 
1416     __ pop(rax); // return address
1417     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
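    // enterSpecial's three arguments sit on the interpreter's expression stack with
    // the last argument on top: isVirtualThread at offset 0, isContinue at offset 1,
    // and the Continuation oop at offset 2.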
1418     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1419     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1420     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1421     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1422     __ push(rax); // return address
1423     __ push_cont_fastpath();
1424 
1425     __ enter();
1426 
1427     stack_slots = 2; // will be adjusted in setup
1428     OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
    // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1431 
1432     __ verify_oop(reg_cont_obj);
1433 
1434     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1435 
1436     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1437     __ testptr(reg_is_cont, reg_is_cont);
1438     __ jcc(Assembler::notZero, L_thaw);
1439 
1440     // --- Resolve path
1441 
1442     // Make sure the call is patchable
1443     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1444     // Emit stub for static call
1445     CodeBuffer* cbuf = masm->code_section()->outer();
1446     address stub = CompiledDirectCall::emit_to_interp_stub(*cbuf, __ pc());
1447     if (stub == nullptr) {
1448       fatal("CodeCache is full at gen_continuation_enter");
1449     }
1450     __ call(resolve);
1451     oop_maps->add_gc_map(__ pc() - start, map);
1452     __ post_call_nop();
1453 
1454     __ jmp(L_exit);
1455   }
1456 
1457   // compiled entry
1458   __ align(CodeEntryAlignment);
1459   compiled_entry_offset = __ pc() - start;
1460   __ enter();
1461 
1462   stack_slots = 2; // will be adjusted in setup
1463   OopMap* map = continuation_enter_setup(masm, stack_slots);
1464 
1465   // Frame is now completed as far as size and linkage.
1466   frame_complete = __ pc() - start;
1467 
1468   __ verify_oop(reg_cont_obj);
1469 
1470   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1471 
1472   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1473   __ testptr(reg_is_cont, reg_is_cont);
1474   __ jccb(Assembler::notZero, L_thaw);
1475 
1476   // --- call Continuation.enter(Continuation c, boolean isContinue)
1477 
1478   // Make sure the call is patchable
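  // (aligning so that the 4-byte displacement of the call sits on a word
  // boundary allows the displacement to be patched atomically)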
1479   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1480 
1481   // Emit stub for static call
1482   CodeBuffer* cbuf = masm->code_section()->outer();
1483   address stub = CompiledDirectCall::emit_to_interp_stub(*cbuf, __ pc());
1484   if (stub == nullptr) {
1485     fatal("CodeCache is full at gen_continuation_enter");
1486   }
1487 
1488   // The call needs to be resolved. There's a special case for this in
1489   // SharedRuntime::find_callee_info_helper() which calls
1490   // LinkResolver::resolve_continuation_enter() which resolves the call to
1491   // Continuation.enter(Continuation c, boolean isContinue).
1492   __ call(resolve);
1493 
1494   oop_maps->add_gc_map(__ pc() - start, map);
1495   __ post_call_nop();
1496 
1497   __ jmpb(L_exit);
1498 
1499   // --- Thawing path
1500 
1501   __ bind(L_thaw);
1502 
1503   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1504 
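  // Record the offset of the pc right after the thaw call; this is the
  // continuation entry's return pc, i.e. where thawed frames eventually return.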
1505   ContinuationEntry::_return_pc_offset = __ pc() - start;
1506   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1507   __ post_call_nop();
1508 
1509   // --- Normal exit (resolve/thawing)
1510 
1511   __ bind(L_exit);
1512 
1513   continuation_enter_cleanup(masm);
1514   __ pop(rbp);
1515   __ ret(0);
1516 
1517   // --- Exception handling path
1518 
1519   exception_offset = __ pc() - start;
1520 
1521   continuation_enter_cleanup(masm);
1522   __ pop(rbp);
1523 
1524   __ movptr(c_rarg0, r15_thread);
1525   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1526 
1527   // rax still holds the original exception oop, save it before the call
1528   __ push(rax);
1529 
1530   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1531   __ movptr(rbx, rax);
1532 
1533   // Continue at exception handler:
1534   //   rax: exception oop
1535   //   rbx: exception handler
1536   //   rdx: exception pc
1537   __ pop(rax);
1538   __ verify_oop(rax);
1539   __ pop(rdx);
1540   __ jmp(rbx);
1541 }
1542 
1543 static void gen_continuation_yield(MacroAssembler* masm,
1544                                    const VMRegPair* regs,
1545                                    OopMapSet* oop_maps,
1546                                    int& frame_complete,
1547                                    int& stack_slots,
1548                                    int& compiled_entry_offset) {
1549   enum layout {
1550     rbp_off,
1551     rbpH_off,
1552     return_off,
1553     return_off2,
1554     framesize // inclusive of return address
1555   };
1556   stack_slots = framesize /  VMRegImpl::slots_per_word;
1557   assert(stack_slots == 2, "recheck layout");
1558 
1559   address start = __ pc();
1560   compiled_entry_offset = __ pc() - start;
1561   __ enter();
1562   address the_pc = __ pc();
1563 
1564   frame_complete = the_pc - start;
1565 
1566   // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, so associate the OopMap
1568   // with it right away.
1569   __ post_call_nop();
1570   OopMap* map = new OopMap(framesize, 1);
1571   oop_maps->add_gc_map(frame_complete, map);
1572 
1573   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1574   __ movptr(c_rarg0, r15_thread);
1575   __ movptr(c_rarg1, rsp);
1576   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1577   __ reset_last_Java_frame(true);
1578 
1579   Label L_pinned;
1580 
1581   __ testptr(rax, rax);
1582   __ jcc(Assembler::notZero, L_pinned);
1583 
1584   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1585   continuation_enter_cleanup(masm);
1586   __ pop(rbp);
1587   __ ret(0);
1588 
1589   __ bind(L_pinned);
1590 
1591   // Pinned, return to caller
1592 
1593   // handle pending exception thrown by freeze
1594   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1595   Label ok;
1596   __ jcc(Assembler::equal, ok);
1597   __ leave();
1598   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1599   __ bind(ok);
1600 
1601   __ leave();
1602   __ ret(0);
1603 }
1604 
1605 static void gen_special_dispatch(MacroAssembler* masm,
1606                                  const methodHandle& method,
1607                                  const BasicType* sig_bt,
1608                                  const VMRegPair* regs) {
1609   verify_oop_args(masm, method, sig_bt, regs);
1610   vmIntrinsics::ID iid = method->intrinsic_id();
1611 
1612   // Now write the args into the outgoing interpreter space
1613   bool     has_receiver   = false;
1614   Register receiver_reg   = noreg;
1615   int      member_arg_pos = -1;
1616   Register member_reg     = noreg;
1617   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1618   if (ref_kind != 0) {
1619     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1620     member_reg = rbx;  // known to be free at this point
1621     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1622   } else if (iid == vmIntrinsics::_invokeBasic) {
1623     has_receiver = true;
1624   } else if (iid == vmIntrinsics::_linkToNative) {
1625     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1626     member_reg = rbx;  // known to be free at this point
1627   } else {
1628     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1629   }
1630 
1631   if (member_reg != noreg) {
1632     // Load the member_arg into register, if necessary.
1633     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1634     VMReg r = regs[member_arg_pos].first();
1635     if (r->is_stack()) {
1636       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1637     } else {
1638       // no data motion is needed
1639       member_reg = r->as_Register();
1640     }
1641   }
1642 
1643   if (has_receiver) {
1644     // Make sure the receiver is loaded into a register.
1645     assert(method->size_of_parameters() > 0, "oob");
1646     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1647     VMReg r = regs[0].first();
1648     assert(r->is_valid(), "bad receiver arg");
1649     if (r->is_stack()) {
1650       // Porting note:  This assumes that compiled calling conventions always
1651       // pass the receiver oop in a register.  If this is not true on some
1652       // platform, pick a temp and load the receiver from stack.
1653       fatal("receiver always in a register");
1654       receiver_reg = j_rarg0;  // known to be free at this point
1655       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1656     } else {
1657       // no data motion is needed
1658       receiver_reg = r->as_Register();
1659     }
1660   }
1661 
1662   // Figure out which address we are really jumping to:
1663   MethodHandles::generate_method_handle_dispatch(masm, iid,
1664                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1665 }
1666 
1667 // ---------------------------------------------------------------------------
1668 // Generate a native wrapper for a given method.  The method takes arguments
1669 // in the Java compiled code convention, marshals them to the native
1670 // convention (handlizes oops, etc), transitions to native, makes the call,
1671 // returns to java state (possibly blocking), unhandlizes any result and
1672 // returns.
1673 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1682 //
1683 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1684                                                 const methodHandle& method,
1685                                                 int compile_id,
1686                                                 BasicType* in_sig_bt,
1687                                                 VMRegPair* in_regs,
1688                                                 BasicType ret_type) {
1689   if (method->is_continuation_native_intrinsic()) {
1690     int exception_offset = -1;
1691     OopMapSet* oop_maps = new OopMapSet();
1692     int frame_complete = -1;
1693     int stack_slots = -1;
1694     int interpreted_entry_offset = -1;
1695     int vep_offset = -1;
1696     if (method->is_continuation_enter_intrinsic()) {
1697       gen_continuation_enter(masm,
1698                              in_regs,
1699                              exception_offset,
1700                              oop_maps,
1701                              frame_complete,
1702                              stack_slots,
1703                              interpreted_entry_offset,
1704                              vep_offset);
1705     } else if (method->is_continuation_yield_intrinsic()) {
1706       gen_continuation_yield(masm,
1707                              in_regs,
1708                              oop_maps,
1709                              frame_complete,
1710                              stack_slots,
1711                              vep_offset);
1712     } else {
1713       guarantee(false, "Unknown Continuation native intrinsic");
1714     }
1715 
1716 #ifdef ASSERT
1717     if (method->is_continuation_enter_intrinsic()) {
1718       assert(interpreted_entry_offset != -1, "Must be set");
1719       assert(exception_offset != -1,         "Must be set");
1720     } else {
1721       assert(interpreted_entry_offset == -1, "Must be unset");
1722       assert(exception_offset == -1,         "Must be unset");
1723     }
1724     assert(frame_complete != -1,    "Must be set");
1725     assert(stack_slots != -1,       "Must be set");
1726     assert(vep_offset != -1,        "Must be set");
1727 #endif
1728 
1729     __ flush();
1730     nmethod* nm = nmethod::new_native_nmethod(method,
1731                                               compile_id,
1732                                               masm->code(),
1733                                               vep_offset,
1734                                               frame_complete,
1735                                               stack_slots,
1736                                               in_ByteSize(-1),
1737                                               in_ByteSize(-1),
1738                                               oop_maps,
1739                                               exception_offset);
1740     if (nm == nullptr) return nm;
1741     if (method->is_continuation_enter_intrinsic()) {
1742       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1743     } else if (method->is_continuation_yield_intrinsic()) {
1744       _cont_doYield_stub = nm;
1745     }
1746     return nm;
1747   }
1748 
1749   if (method->is_method_handle_intrinsic()) {
1750     vmIntrinsics::ID iid = method->intrinsic_id();
1751     intptr_t start = (intptr_t)__ pc();
1752     int vep_offset = ((intptr_t)__ pc()) - start;
1753     gen_special_dispatch(masm,
1754                          method,
1755                          in_sig_bt,
1756                          in_regs);
1757     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1758     __ flush();
1759     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1760     return nmethod::new_native_nmethod(method,
1761                                        compile_id,
1762                                        masm->code(),
1763                                        vep_offset,
1764                                        frame_complete,
1765                                        stack_slots / VMRegImpl::slots_per_word,
1766                                        in_ByteSize(-1),
1767                                        in_ByteSize(-1),
1768                                        nullptr);
1769   }
1770   address native_func = method->native_function();
1771   assert(native_func != nullptr, "must have function");
1772 
1773   // An OopMap for lock (and class if static)
1774   OopMapSet *oop_maps = new OopMapSet();
1775   intptr_t start = (intptr_t)__ pc();
1776 
  // We have received a description of where all the java args are located
1778   // on entry to the wrapper. We need to convert these args to where
1779   // the jni function will expect them. To figure out where they go
1780   // we convert the java signature to a C signature by inserting
1781   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1782 
1783   const int total_in_args = method->size_of_parameters();
1784   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1785 
1786   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1787   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1788   BasicType* in_elem_bt = nullptr;
1789 
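  // Hidden C arguments: the JNIEnv* always comes first (passed as T_ADDRESS);
  // for static methods the class mirror follows as the second argument.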
1790   int argc = 0;
1791   out_sig_bt[argc++] = T_ADDRESS;
1792   if (method->is_static()) {
1793     out_sig_bt[argc++] = T_OBJECT;
1794   }
1795 
1796   for (int i = 0; i < total_in_args ; i++ ) {
1797     out_sig_bt[argc++] = in_sig_bt[i];
1798   }
1799 
1800   // Now figure out where the args must be stored and how much stack space
1801   // they require.
1802   int out_arg_slots;
1803   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1804 
1805   // Compute framesize for the wrapper.  We need to handlize all oops in
1806   // incoming registers
1807 
1808   // Calculate the total number of stack slots we will need.
1809 
1810   // First count the abi requirement plus all of the outgoing args
1811   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1812 
1813   // Now the space for the inbound oop handle area
1814   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1815 
1816   int oop_handle_offset = stack_slots;
1817   stack_slots += total_save_slots;
1818 
1819   // Now any space we need for handlizing a klass if static method
1820 
1821   int klass_slot_offset = 0;
1822   int klass_offset = -1;
1823   int lock_slot_offset = 0;
1824   bool is_static = false;
1825 
1826   if (method->is_static()) {
1827     klass_slot_offset = stack_slots;
1828     stack_slots += VMRegImpl::slots_per_word;
1829     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1830     is_static = true;
1831   }
1832 
1833   // Plus a lock if needed
1834 
1835   if (method->is_synchronized()) {
1836     lock_slot_offset = stack_slots;
1837     stack_slots += VMRegImpl::slots_per_word;
1838   }
1839 
1840   // Now a place (+2) to save return values or temp during shuffling
1841   // + 4 for return address (which we own) and saved rbp
1842   stack_slots += 6;
1843 
1844   // Ok The space we have allocated will look like:
1845   //
1846   //
1847   // FP-> |                     |
1848   //      |---------------------|
1849   //      | 2 slots for moves   |
1850   //      |---------------------|
1851   //      | lock box (if sync)  |
1852   //      |---------------------| <- lock_slot_offset
1853   //      | klass (if static)   |
1854   //      |---------------------| <- klass_slot_offset
1855   //      | oopHandle area      |
1856   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1857   //      | outbound memory     |
1858   //      | based arguments     |
1859   //      |                     |
1860   //      |---------------------|
1861   //      |                     |
1862   // SP-> | out_preserved_slots |
1863   //
1864   //
1865 
1866 
  // Now compute the actual number of stack words we need, rounding to keep
  // the stack properly aligned.
1869   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1870 
1871   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1872 
1873   // First thing make an ic check to see if we should even be here
1874 
1875   // We are free to use all registers as temps without saving them and
1876   // restoring them except rbp. rbp is the only callee save register
1877   // as far as the interpreter and the compiler(s) are concerned.
1878 
1879   const Register receiver = j_rarg0;
1880 
1881   Label exception_pending;
1882 
1883   assert_different_registers(receiver, rscratch1, rscratch2);
1884   __ verify_oop(receiver);
1885   __ ic_check(8 /* end_alignment */);
1886 
1887   int vep_offset = ((intptr_t)__ pc()) - start;
1888 
1889   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1890     Label L_skip_barrier;
1891     Register klass = r10;
1892     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1893     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1894 
1895     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1896 
1897     __ bind(L_skip_barrier);
1898   }
1899 
1900 #ifdef COMPILER1
1901   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1902   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1903     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1904   }
1905 #endif // COMPILER1
1906 
1907   // The instruction at the verified entry point must be 5 bytes or longer
1908   // because it can be patched on the fly by make_non_entrant. The stack bang
1909   // instruction fits that requirement.
1910 
1911   // Generate stack overflow check
1912   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1913 
1914   // Generate a new frame for the wrapper.
1915   __ enter();
1916   // -2 because return address is already present and so is saved rbp
1917   __ subptr(rsp, stack_size - 2*wordSize);
1918 
1919   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1920   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1921   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1922 
1923   // Frame is now completed as far as size and linkage.
1924   int frame_complete = ((intptr_t)__ pc()) - start;
1925 
  if (UseRTMLocking) {
    // Abort the RTM transaction before calling JNI because the critical
    // section will be large and will be aborted anyway. Also the nmethod
    // could be deoptimized.
    __ xabort(0);
  }
1932 
1933 #ifdef ASSERT
1934   __ check_stack_alignment(rsp, "improperly aligned stack");
1935 #endif /* ASSERT */
1936 
1937 
1938   // We use r14 as the oop handle for the receiver/klass
1939   // It is callee save so it survives the call to native
1940 
1941   const Register oop_handle_reg = r14;
1942 
1943   //
1944   // We immediately shuffle the arguments so that any vm call we have to
1945   // make from here on out (sync slow path, jvmti, etc.) we will have
1946   // captured the oops from our caller and have a valid oopMap for
1947   // them.
1948 
1949   // -----------------
1950   // The Grand Shuffle
1951 
  // The Java calling convention is either equal (linux) or denser (win64) than the
  // C calling convention. However, because of the jni_env argument, the C calling
  // convention always has at least one more argument (and two for static methods) than Java.
  // Therefore, if we move the args from java -> c backwards, we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.
1958   //
1959 
1960   // Record esp-based slot for receiver on stack for non-static methods
1961   int receiver_offset = -1;
1962 
1963   // This is a trick. We double the stack slots so we can claim
1964   // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
1966   // sure we can capture all the incoming oop args from the
1967   // caller.
1968   //
1969   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1970 
1971   // Mark location of rbp (someday)
1972   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1973 
1974   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1975   // All inbound args are referenced based on rbp and all outbound args via rsp.
1976 
1977 
1978 #ifdef ASSERT
1979   bool reg_destroyed[Register::number_of_registers];
1980   bool freg_destroyed[XMMRegister::number_of_registers];
1981   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
1982     reg_destroyed[r] = false;
1983   }
1984   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
1985     freg_destroyed[f] = false;
1986   }
1987 
1988 #endif /* ASSERT */
1989 
1990   // For JNI natives the incoming and outgoing registers are offset upwards.
1991   GrowableArray<int> arg_order(2 * total_in_args);
1992 
1993   VMRegPair tmp_vmreg;
1994   tmp_vmreg.set2(rbx->as_VMReg());
1995 
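  // Collect (java_arg_index, c_arg_index) pairs from last to first so the shuffle
  // below walks the arguments backwards (see the register-conflict comment above).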
1996   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1997     arg_order.push(i);
1998     arg_order.push(c_arg);
1999   }
2000 
2001   int temploc = -1;
2002   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2003     int i = arg_order.at(ai);
2004     int c_arg = arg_order.at(ai + 1);
2005     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2006 #ifdef ASSERT
2007     if (in_regs[i].first()->is_Register()) {
2008       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2009     } else if (in_regs[i].first()->is_XMMRegister()) {
2010       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2011     }
2012     if (out_regs[c_arg].first()->is_Register()) {
2013       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2014     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2015       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2016     }
2017 #endif /* ASSERT */
2018     switch (in_sig_bt[i]) {
2019       case T_ARRAY:
2020       case T_OBJECT:
2021         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2022                     ((i == 0) && (!is_static)),
2023                     &receiver_offset);
2024         break;
2025       case T_VOID:
2026         break;
2027 
      case T_FLOAT:
        __ float_move(in_regs[i], out_regs[c_arg]);
        break;
2031 
2032       case T_DOUBLE:
2033         assert( i + 1 < total_in_args &&
2034                 in_sig_bt[i + 1] == T_VOID &&
2035                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2036         __ double_move(in_regs[i], out_regs[c_arg]);
2037         break;
2038 
2039       case T_LONG :
2040         __ long_move(in_regs[i], out_regs[c_arg]);
2041         break;
2042 
2043       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2044 
2045       default:
2046         __ move32_64(in_regs[i], out_regs[c_arg]);
2047     }
2048   }
2049 
2050   int c_arg;
2051 
2052   // Pre-load a static method's oop into r14.  Used both by locking code and
2053   // the normal JNI call code.
2054   // point c_arg at the first arg that is already loaded in case we
2055   // need to spill before we call out
2056   c_arg = total_c_args - total_in_args;
2057 
2058   if (method->is_static()) {
2059 
2060     //  load oop into a register
2061     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2062 
    // Now handlize the static class mirror; it's known to be not-null.
2064     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2065     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2066 
2067     // Now get the handle
2068     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2069     // store the klass handle as second argument
2070     __ movptr(c_rarg1, oop_handle_reg);
2071     // and protect the arg if we must spill
2072     c_arg--;
2073   }
2074 
2075   // Change state to native (we save the return address in the thread, since it might not
2076   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2077   // points into the right code segment. It does not have to be the correct return pc.
2078   // We use the same pc/oopMap repeatedly when we call out
2079 
2080   intptr_t the_pc = (intptr_t) __ pc();
2081   oop_maps->add_gc_map(the_pc - start, map);
2082 
2083   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2084 
2085 
  // We have all of the arguments set up at this point. We must not touch any of the
  // argument registers from here on (if we had to save/restore them, there would be
  // no oop map covering the saved values).
2088 
2089   {
2090     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2091     // protect the args we've loaded
2092     save_args(masm, total_c_args, c_arg, out_regs);
2093     __ mov_metadata(c_rarg1, method());
2094     __ call_VM_leaf(
2095       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2096       r15_thread, c_rarg1);
2097     restore_args(masm, total_c_args, c_arg, out_regs);
2098   }
2099 
2100   // RedefineClasses() tracing support for obsolete method entry
2101   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2102     // protect the args we've loaded
2103     save_args(masm, total_c_args, c_arg, out_regs);
2104     __ mov_metadata(c_rarg1, method());
2105     __ call_VM_leaf(
2106       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2107       r15_thread, c_rarg1);
2108     restore_args(masm, total_c_args, c_arg, out_regs);
2109   }
2110 
2111   // Lock a synchronized method
2112 
2113   // Register definitions used by locking and unlocking
2114 
2115   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2116   const Register obj_reg  = rbx;  // Will contain the oop
2117   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2118   const Register old_hdr  = r13;  // value of old header at unlock time
2119 
2120   Label slow_path_lock;
2121   Label lock_done;
2122 
2123   if (method->is_synchronized()) {
2124     Label count_mon;
2125 
2126     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2127 
2128     // Get the handle (the 2nd argument)
2129     __ mov(oop_handle_reg, c_rarg1);
2130 
2131     // Get address of the box
2132 
2133     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2134 
2135     // Load the oop from the handle
2136     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2137 
2138     if (LockingMode == LM_MONITOR) {
2139       __ jmp(slow_path_lock);
2140     } else if (LockingMode == LM_LEGACY) {
2141       // Load immediate 1 into swap_reg %rax
2142       __ movl(swap_reg, 1);
2143 
2144       // Load (object->mark() | 1) into swap_reg %rax
2145       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2146 
2147       // Save (object->mark() | 1) into BasicLock's displaced header
2148       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2149 
2150       // src -> dest iff dest == rax else rax <- dest
2151       __ lock();
2152       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2153       __ jcc(Assembler::equal, count_mon);
2154 
2155       // Hmm should this move to the slow path code area???
2156 
2157       // Test if the oopMark is an obvious stack pointer, i.e.,
2158       //  1) (mark & 3) == 0, and
      //  2) rsp <= mark < rsp + os::pagesize()
2160       // These 3 tests can be done by evaluating the following
2161       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2162       // assuming both stack pointer and pagesize have their
2163       // least significant 2 bits clear.
2164       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
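      // Worked example, assuming a 4K page: 3 - 4096 == -4093 == 0xFFFF...F003 as a
      // 64-bit mask, so the AND below is zero exactly when (mark - rsp) has its low
      // two bits clear and is smaller than the page size, i.e. the mark is a stack
      // lock just above rsp on this thread's stack (the recursive case).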
2165 
2166       __ subptr(swap_reg, rsp);
2167       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2168 
2169       // Save the test result, for recursive case, the result is zero
2170       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2171       __ jcc(Assembler::notEqual, slow_path_lock);
2172     } else {
2173       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2174       __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2175     }
2176     __ bind(count_mon);
2177     __ inc_held_monitor_count();
2178 
2179     // Slow path will re-enter here
2180     __ bind(lock_done);
2181   }
2182 
2183   // Finally just about ready to make the JNI call
2184 
2185   // get JNIEnv* which is first argument to native
2186   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2187 
2188   // Now set thread in native
2189   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2190 
2191   __ call(RuntimeAddress(native_func));
2192 
2193   // Verify or restore cpu control state after JNI call
2194   __ restore_cpu_control_state_after_jni(rscratch1);
2195 
2196   // Unpack native results.
2197   switch (ret_type) {
2198   case T_BOOLEAN: __ c2bool(rax);            break;
2199   case T_CHAR   : __ movzwl(rax, rax);      break;
2200   case T_BYTE   : __ sign_extend_byte (rax); break;
2201   case T_SHORT  : __ sign_extend_short(rax); break;
2202   case T_INT    : /* nothing to do */        break;
2203   case T_DOUBLE :
2204   case T_FLOAT  :
    // Result is in xmm0; we'll save as needed
2206     break;
2207   case T_ARRAY:                 // Really a handle
2208   case T_OBJECT:                // Really a handle
2209       break; // can't de-handlize until after safepoint check
2210   case T_VOID: break;
2211   case T_LONG: break;
2212   default       : ShouldNotReachHere();
2213   }
2214 
2215   Label after_transition;
2216 
2217   // Switch thread to "native transition" state before reading the synchronization state.
2218   // This additional state is necessary because reading and testing the synchronization
2219   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2220   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2221   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2222   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
2224   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2225 
2226   // Force this write out before the read below
2227   if (!UseSystemMemoryBarrier) {
2228     __ membar(Assembler::Membar_mask_bits(
2229               Assembler::LoadLoad | Assembler::LoadStore |
2230               Assembler::StoreLoad | Assembler::StoreStore));
2231   }
2232 
2233   // check for safepoint operation in progress and/or pending suspend requests
2234   {
2235     Label Continue;
2236     Label slow_path;
2237 
2238     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2239 
2240     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2241     __ jcc(Assembler::equal, Continue);
2242     __ bind(slow_path);
2243 
2244     // Don't use call_VM as it will see a possible pending exception and forward it
2245     // and never return here preventing us from clearing _last_native_pc down below.
2246     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2247     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2248     // by hand.
2249     //
2250     __ vzeroupper();
2251     save_native_result(masm, ret_type, stack_slots);
2252     __ mov(c_rarg0, r15_thread);
2253     __ mov(r12, rsp); // remember sp
2254     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2255     __ andptr(rsp, -16); // align stack as required by ABI
2256     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2257     __ mov(rsp, r12); // restore sp
2258     __ reinit_heapbase();
2259     // Restore any method result value
2260     restore_native_result(masm, ret_type, stack_slots);
2261     __ bind(Continue);
2262   }
2263 
2264   // change thread state
2265   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2266   __ bind(after_transition);
2267 
2268   Label reguard;
2269   Label reguard_done;
2270   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2271   __ jcc(Assembler::equal, reguard);
2272   __ bind(reguard_done);
2273 
2274   // native result if any is live
2275 
2276   // Unlock
2277   Label slow_path_unlock;
2278   Label unlock_done;
2279   if (method->is_synchronized()) {
2280 
2281     Label fast_done;
2282 
2283     // Get locked oop from the handle we passed to jni
2284     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2285 
2286     if (LockingMode == LM_LEGACY) {
2287       Label not_recur;
2288       // Simple recursive lock?
2289       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2290       __ jcc(Assembler::notEqual, not_recur);
2291       __ dec_held_monitor_count();
2292       __ jmpb(fast_done);
2293       __ bind(not_recur);
2294     }
2295 
2296     // Must save rax if it is live now because cmpxchg must use it
2297     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2298       save_native_result(masm, ret_type, stack_slots);
2299     }
2300 
2301     if (LockingMode == LM_MONITOR) {
2302       __ jmp(slow_path_unlock);
2303     } else if (LockingMode == LM_LEGACY) {
2304       // get address of the stack lock
2305       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2306       //  get old displaced header
2307       __ movptr(old_hdr, Address(rax, 0));
2308 
2309       // Atomic swap old header if oop still contains the stack lock
2310       __ lock();
2311       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2312       __ jcc(Assembler::notEqual, slow_path_unlock);
2313       __ dec_held_monitor_count();
2314     } else {
2315       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2316       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2317       __ dec_held_monitor_count();
2318     }
2319 
2320     // slow path re-enters here
2321     __ bind(unlock_done);
2322     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2323       restore_native_result(masm, ret_type, stack_slots);
2324     }
2325 
2326     __ bind(fast_done);
2327   }
2328   {
2329     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2330     save_native_result(masm, ret_type, stack_slots);
2331     __ mov_metadata(c_rarg1, method());
2332     __ call_VM_leaf(
2333          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2334          r15_thread, c_rarg1);
2335     restore_native_result(masm, ret_type, stack_slots);
2336   }
2337 
2338   __ reset_last_Java_frame(false);
2339 
2340   // Unbox oop result, e.g. JNIHandles::resolve value.
2341   if (is_reference_type(ret_type)) {
2342     __ resolve_jobject(rax /* value */,
2343                        r15_thread /* thread */,
2344                        rcx /* tmp */);
2345   }
2346 
2347   if (CheckJNICalls) {
2348     // clear_pending_jni_exception_check
2349     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2350   }
2351 
2352   // reset handle block
2353   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2354   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2355 
2356   // pop our frame
2357 
2358   __ leave();
2359 
2360   // Any exception pending?
2361   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2362   __ jcc(Assembler::notEqual, exception_pending);
2363 
2364   // Return
2365 
2366   __ ret(0);
2367 
  // Unexpected paths are out of line and go here

  // forward the exception
  __ bind(exception_pending);
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2375 
2376   // Slow path locking & unlocking
2377   if (method->is_synchronized()) {
2378 
2379     // BEGIN Slow path lock
2380     __ bind(slow_path_lock);
2381 
2382     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2383     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2384 
2385     // protect the args we've loaded
2386     save_args(masm, total_c_args, c_arg, out_regs);
2387 
2388     __ mov(c_rarg0, obj_reg);
2389     __ mov(c_rarg1, lock_reg);
2390     __ mov(c_rarg2, r15_thread);
2391 
2392     // Not a leaf but we have last_Java_frame setup as we want
2393     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2394     restore_args(masm, total_c_args, c_arg, out_regs);
2395 
2396 #ifdef ASSERT
2397     { Label L;
2398     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2399     __ jcc(Assembler::equal, L);
2400     __ stop("no pending exception allowed on exit from monitorenter");
2401     __ bind(L);
2402     }
2403 #endif
2404     __ jmp(lock_done);
2405 
2406     // END Slow path lock
2407 
2408     // BEGIN Slow path unlock
2409     __ bind(slow_path_unlock);
2410 
2411     // If we haven't already saved the native result we must save it now as xmm registers
2412     // are still exposed.
2413     __ vzeroupper();
2414     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2415       save_native_result(masm, ret_type, stack_slots);
2416     }
2417 
2418     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2419 
2420     __ mov(c_rarg0, obj_reg);
2421     __ mov(c_rarg2, r15_thread);
2422     __ mov(r12, rsp); // remember sp
2423     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2424     __ andptr(rsp, -16); // align stack as required by ABI
2425 
2426     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2427     // NOTE that obj_reg == rbx currently
2428     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2429     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2430 
2431     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2432     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2433     __ mov(rsp, r12); // restore sp
2434     __ reinit_heapbase();
2435 #ifdef ASSERT
2436     {
2437       Label L;
2438       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2439       __ jcc(Assembler::equal, L);
2440       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2441       __ bind(L);
2442     }
2443 #endif /* ASSERT */
2444 
2445     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2446 
2447     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2448       restore_native_result(masm, ret_type, stack_slots);
2449     }
2450     __ jmp(unlock_done);
2451 
2452     // END Slow path unlock
2453 
2454   } // synchronized
2455 
2456   // SLOW PATH Reguard the stack if needed
2457 
2458   __ bind(reguard);
2459   __ vzeroupper();
2460   save_native_result(masm, ret_type, stack_slots);
2461   __ mov(r12, rsp); // remember sp
2462   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2463   __ andptr(rsp, -16); // align stack as required by ABI
2464   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2465   __ mov(rsp, r12); // restore sp
2466   __ reinit_heapbase();
2467   restore_native_result(masm, ret_type, stack_slots);
2468   // and continue
2469   __ jmp(reguard_done);
2470 
2471 
2472 
2473   __ flush();
2474 
2475   nmethod *nm = nmethod::new_native_nmethod(method,
2476                                             compile_id,
2477                                             masm->code(),
2478                                             vep_offset,
2479                                             frame_complete,
2480                                             stack_slots / VMRegImpl::slots_per_word,
2481                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2482                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2483                                             oop_maps);
2484 
2485   return nm;
2486 }
2487 
// This function returns the adjustment size (in number of words) to a c2i adapter
// activation for use during deoptimization.
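// For example, a callee with 2 parameters and 5 locals needs
// (5 - 2) * Interpreter::stackElementWords extra words.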
2490 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2491   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2492 }
2493 
2494 
2495 uint SharedRuntime::out_preserve_stack_slots() {
2496   return 0;
2497 }
2498 
2499 
2500 // Number of stack slots between incoming argument block and the start of
2501 // a new frame.  The PROLOG must add this many slots to the stack.  The
2502 // EPILOG must remove this many slots.  amd64 needs two slots for
2503 // return address.
2504 uint SharedRuntime::in_preserve_stack_slots() {
2505   return 4 + 2 * VerifyStackAtCalls;
2506 }
2507 
2508 //------------------------------generate_deopt_blob----------------------------
2509 void SharedRuntime::generate_deopt_blob() {
2510   // Allocate space for the code
2511   ResourceMark rm;
2512   // Setup code generation tools
2513   int pad = 0;
2514   if (UseAVX > 2) {
2515     pad += 1024;
2516   }
2517 #if INCLUDE_JVMCI
2518   if (EnableJVMCI) {
2519     pad += 512; // Increase the buffer size when compiling for JVMCI
2520   }
2521 #endif
2522   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2523   MacroAssembler* masm = new MacroAssembler(&buffer);
2524   int frame_size_in_words;
2525   OopMap* map = nullptr;
2526   OopMapSet *oop_maps = new OopMapSet();
2527 
2528   // -------------
2529   // This code enters when returning to a de-optimized nmethod.  A return
2530   // address has been pushed on the stack, and return values are in
2531   // registers.
2532   // If we are doing a normal deopt then we were called from the patched
2533   // nmethod from the point we returned to the nmethod. So the return
2534   // address on the stack is wrong by NativeCall::instruction_size
2535   // We will adjust the value so it looks like we have the original return
2536   // address on the stack (like when we eagerly deoptimized).
2537   // In the case of an exception pending when deoptimizing, we enter
2538   // with a return address on the stack that points after the call we patched
2539   // into the exception handler. We have the following register state from,
2540   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2541   //    rax: exception oop
2542   //    rbx: exception handler
2543   //    rdx: throwing pc
2544   // So in this case we simply jam rdx into the useless return address and
2545   // the stack looks just like we want.
2546   //
2547   // At this point we need to de-opt.  We save the argument return
2548   // registers.  We call the first C routine, fetch_unroll_info().  This
2549   // routine captures the return values and returns a structure which
2550   // describes the current frame size and the sizes of all replacement frames.
2551   // The current frame is compiled code and may contain many inlined
2552   // functions, each with their own JVM state.  We pop the current frame, then
2553   // push all the new frames.  Then we call the C routine unpack_frames() to
2554   // populate these frames.  Finally unpack_frames() returns us the new target
2555   // address.  Notice that callee-save registers are BLOWN here; they have
2556   // already been captured in the vframeArray at the time the return PC was
2557   // patched.
2558   address start = __ pc();
2559   Label cont;
2560 
2561   // Prolog for non exception case!
2562 
2563   // Save everything in sight.
2564   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2565 
2566   // Normal deoptimization.  Save exec mode for unpack_frames.
2567   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2568   __ jmp(cont);
2569 
2570   int reexecute_offset = __ pc() - start;
2571 #if INCLUDE_JVMCI && !defined(COMPILER1)
2572   if (EnableJVMCI && UseJVMCICompiler) {
2573     // JVMCI does not use this kind of deoptimization
2574     __ should_not_reach_here();
2575   }
2576 #endif
2577 
2578   // Reexecute case
  // The return address is the pc that describes which bci to re-execute at.
2580 
  // No need to update map as each call to save_live_registers will produce an identical oopmap
2582   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2583 
2584   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2585   __ jmp(cont);
2586 
2587 #if INCLUDE_JVMCI
2588   Label after_fetch_unroll_info_call;
2589   int implicit_exception_uncommon_trap_offset = 0;
2590   int uncommon_trap_offset = 0;
2591 
2592   if (EnableJVMCI) {
2593     implicit_exception_uncommon_trap_offset = __ pc() - start;
2594 
2595     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2596     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2597 
2598     uncommon_trap_offset = __ pc() - start;
2599 
2600     // Save everything in sight.
2601     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2602     // fetch_unroll_info needs to call last_java_frame()
2603     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2604 
2605     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2606     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2607 
2608     __ movl(r14, Deoptimization::Unpack_reexecute);
2609     __ mov(c_rarg0, r15_thread);
2610     __ movl(c_rarg2, r14); // exec mode
2611     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2612     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2613 
2614     __ reset_last_Java_frame(false);
2615 
2616     __ jmp(after_fetch_unroll_info_call);
2617   } // EnableJVMCI
2618 #endif // INCLUDE_JVMCI
2619 
2620   int exception_offset = __ pc() - start;
2621 
2622   // Prolog for exception case
2623 
  // All registers are dead at this entry point, except for rax and
  // rdx, which contain the exception oop and exception pc
  // respectively.  Set them in TLS and fall thru to the
  // unpack_with_exception_in_tls entry point.
2628 
2629   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2630   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2631 
2632   int exception_in_tls_offset = __ pc() - start;
2633 
2634   // new implementation because exception oop is now passed in JavaThread
2635 
2636   // Prolog for exception case
2637   // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2639   // tos: stack at point of call to method that threw the exception (i.e. only
2640   // args are on the stack, no return address)
2641 
2642   // make room on stack for the return address
2643   // It will be patched later with the throwing pc. The correct value is not
2644   // available now because loading it from memory would destroy registers.
2645   __ push(0);
2646 
2647   // Save everything in sight.
2648   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2649 
2650   // Now it is safe to overwrite any register
2651 
2652   // Deopt during an exception.  Save exec mode for unpack_frames.
2653   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2654 
2655   // load throwing pc from JavaThread and patch it as the return address
2656   // of the current frame. Then clear the field in JavaThread
2657 
2658   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2659   __ movptr(Address(rbp, wordSize), rdx);
2660   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2661 
2662 #ifdef ASSERT
2663   // verify that there is really an exception oop in JavaThread
2664   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2665   __ verify_oop(rax);
2666 
2667   // verify that there is no pending exception
2668   Label no_pending_exception;
2669   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2670   __ testptr(rax, rax);
2671   __ jcc(Assembler::zero, no_pending_exception);
2672   __ stop("must not have pending exception here");
2673   __ bind(no_pending_exception);
2674 #endif
2675 
2676   __ bind(cont);
2677 
2678   // Call C code.  Need thread and this frame, but NOT official VM entry
2679   // crud.  We cannot block on this call, no GC can happen.
2680   //
2681   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2682 
2683   // fetch_unroll_info needs to call last_java_frame().
2684 
2685   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2686 #ifdef ASSERT
2687   { Label L;
2688     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2689     __ jcc(Assembler::equal, L);
2690     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2691     __ bind(L);
2692   }
2693 #endif // ASSERT
2694   __ mov(c_rarg0, r15_thread);
2695   __ movl(c_rarg1, r14); // exec_mode
2696   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2697 
2698   // Need to have an oopmap that tells fetch_unroll_info where to
2699   // find any register it might need.
2700   oop_maps->add_gc_map(__ pc() - start, map);
2701 
2702   __ reset_last_Java_frame(false);
2703 
2704 #if INCLUDE_JVMCI
2705   if (EnableJVMCI) {
2706     __ bind(after_fetch_unroll_info_call);
2707   }
2708 #endif
2709 
2710   // Load UnrollBlock* into rdi
2711   __ mov(rdi, rax);
2712 
2713   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2714   Label noException;
2715   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2716   __ jcc(Assembler::notEqual, noException);
2717   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2718   // QQQ this is useless; it was null above
2719   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2720   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2721   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2722 
2723   __ verify_oop(rax);
2724 
2725   // Overwrite the result registers with the exception results.
2726   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2727   // I think this is useless
2728   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2729 
2730   __ bind(noException);
2731 
2732   // Only register save data is on the stack.
2733   // Now restore the result registers.  Everything else is either dead
2734   // or captured in the vframeArray.
2735   RegisterSaver::restore_result_registers(masm);
2736 
2737   // All of the register save area has been popped off the stack. Only the
2738   // return address remains.
2739 
2740   // Pop all the frames we must move/replace.
2741   //
2742   // Frame picture (youngest to oldest)
2743   // 1: self-frame (no frame link)
2744   // 2: deopting frame  (no frame link)
2745   // 3: caller of deopting frame (could be compiled/interpreted).
2746   //
2747   // Note: by leaving the return address of the self-frame on the stack
2748   // and using the size of frame 2 to adjust the stack,
2749   // the return address into frame 3 will still be on the stack when we are done.
2750 
2751   // Pop deoptimized frame
2752   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2753   __ addptr(rsp, rcx);
2754 
2755   // rsp should be pointing at the return address to the caller (3)
2756 
2757   // Pick up the initial fp we should save
2758   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2759   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2760 
2761 #ifdef ASSERT
2762   // Compilers generate code that bangs the stack by as much as the
2763   // interpreter would need. So this stack banging should never
2764   // trigger a fault. Verify that it does not on non-product builds.
2765   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2766   __ bang_stack_size(rbx, rcx);
2767 #endif
2768 
2769   // Load address of array of frame pcs into rcx
2770   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2771 
2772   // Trash the old pc
2773   __ addptr(rsp, wordSize);
2774 
2775   // Load address of array of frame sizes into rsi
2776   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2777 
2778   // Load counter into rdx
2779   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2780 
2781   // Now adjust the caller's stack to make up for the extra locals,
2782   // but record the original sp first so that we can save it in the skeletal
2783   // interpreter frame; the stack walking of interpreter_sender will then get the
2784   // unextended sp value and not the "real" sp value.
2785 
2786   const Register sender_sp = r8;
2787 
2788   __ mov(sender_sp, rsp);
2789   __ movl(rbx, Address(rdi,
2790                        Deoptimization::UnrollBlock::
2791                        caller_adjustment_offset()));
2792   __ subptr(rsp, rbx);
2793 
2794   // Push interpreter frames in a loop
2795   Label loop;
2796   __ bind(loop);
2797   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2798   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2799   __ pushptr(Address(rcx, 0));          // Save return address
2800   __ enter();                           // Save old & set new ebp
2801   __ subptr(rsp, rbx);                  // Prolog
2802   // This value is corrected by layout_activation_impl
2803   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2804   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2805   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2806   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2807   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2808   __ decrementl(rdx);                   // Decrement counter
2809   __ jcc(Assembler::notZero, loop);
2810   __ pushptr(Address(rcx, 0));          // Save final return address
2811 
2812   // Re-push self-frame
2813   __ enter();                           // Save old & set new ebp
2814 
2815   // Allocate a full sized register save area.
2816   // Return address and rbp are in place, so we allocate two fewer words.
2817   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
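  // (Note: frame_size_in_words, as returned by save_live_registers above, counts the
  // whole save frame including the return-address and rbp slots, which enter() has
  // already pushed; hence the "- 2".)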
2818 
2819   // Restore frame locals after moving the frame
2820   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2821   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2822 
2823   // Call C code.  Need thread but NOT official VM entry
2824   // crud.  We cannot block on this call, no GC can happen.  Call should
2825   // restore return values to their stack-slots with the new SP.
2826   //
2827   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2828 
2829   // Use rbp because the frames look interpreted now
2830   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2831   // Don't need the precise return PC here, just precise enough to point into this code blob.
2832   address the_pc = __ pc();
2833   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2834 
2835   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2836   __ mov(c_rarg0, r15_thread);
2837   __ movl(c_rarg1, r14); // second arg: exec_mode
2838   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2839   // Revert SP alignment after call since we're going to do some SP relative addressing below
2840   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2841 
2842   // Set an oopmap for the call site
2843   // Use the same PC we used for the last java frame
2844   oop_maps->add_gc_map(the_pc - start,
2845                        new OopMap( frame_size_in_words, 0 ));
2846 
2847   // Clear fp AND pc
2848   __ reset_last_Java_frame(true);
2849 
2850   // Collect return values
2851   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2852   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2853   // I think this is useless (throwing pc?)
2854   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2855 
2856   // Pop self-frame.
2857   __ leave();                           // Epilog
2858 
2859   // Jump to interpreter
2860   __ ret(0);
2861 
2862   // Make sure all code is generated
2863   masm->flush();
2864 
2865   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2866   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2867 #if INCLUDE_JVMCI
2868   if (EnableJVMCI) {
2869     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2870     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2871   }
2872 #endif
2873 }
2874 
2875 #ifdef COMPILER2
2876 //------------------------------generate_uncommon_trap_blob--------------------
2877 void SharedRuntime::generate_uncommon_trap_blob() {
2878   // Allocate space for the code
2879   ResourceMark rm;
2880   // Setup code generation tools
2881   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2882   MacroAssembler* masm = new MacroAssembler(&buffer);
2883 
2884   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2885 
2886   address start = __ pc();
2887 
2888   if (UseRTMLocking) {
2889     // Abort RTM transaction before possible nmethod deoptimization.
2890     __ xabort(0);
2891   }
2892 
2893   // Push self-frame.  We get here with a return address on the
2894   // stack, so rsp is 8-byte aligned until we allocate our frame.
2895   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
2896 
2897   // No callee saved registers. rbp is assumed implicitly saved
2898   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2899 
2900   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2901   // runtime expects it.
2902   __ movl(c_rarg1, j_rarg0);
2903 
2904   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2905 
2906   // Call C code.  Need thread but NOT official VM entry
2907   // crud.  We cannot block on this call, no GC can happen.  Call should
2908   // capture callee-saved registers as well as return values.
2909   // The thread (first argument) is passed in c_rarg0 below.
2910   //
2911   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2912 
2913   __ mov(c_rarg0, r15_thread);
2914   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2915   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2916 
2917   // Set an oopmap for the call site
2918   OopMapSet* oop_maps = new OopMapSet();
2919   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2920 
2921   // location of rbp is known implicitly by the frame sender code
2922 
2923   oop_maps->add_gc_map(__ pc() - start, map);
2924 
2925   __ reset_last_Java_frame(false);
2926 
2927   // Load UnrollBlock* into rdi
2928   __ mov(rdi, rax);
2929 
2930 #ifdef ASSERT
2931   { Label L;
2932     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
2933               Deoptimization::Unpack_uncommon_trap);
2934     __ jcc(Assembler::equal, L);
2935     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2936     __ bind(L);
2937   }
2938 #endif
2939 
2940   // Pop all the frames we must move/replace.
2941   //
2942   // Frame picture (youngest to oldest)
2943   // 1: self-frame (no frame link)
2944   // 2: deopting frame  (no frame link)
2945   // 3: caller of deopting frame (could be compiled/interpreted).
2946 
2947   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2948   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2949 
2950   // Pop deoptimized frame (int)
2951   __ movl(rcx, Address(rdi,
2952                        Deoptimization::UnrollBlock::
2953                        size_of_deoptimized_frame_offset()));
2954   __ addptr(rsp, rcx);
2955 
2956   // rsp should be pointing at the return address to the caller (3)
2957 
2958   // Pick up the initial fp we should save
2959   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2960   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2961 
2962 #ifdef ASSERT
2963   // Compilers generate code that bangs the stack by as much as the
2964   // interpreter would need. So this stack banging should never
2965   // trigger a fault. Verify that it does not on non-product builds.
2966   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2967   __ bang_stack_size(rbx, rcx);
2968 #endif
2969 
2970   // Load address of array of frame pcs into rcx (address*)
2971   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2972 
2973   // Trash the return pc
2974   __ addptr(rsp, wordSize);
2975 
2976   // Load address of array of frame sizes into rsi (intptr_t*)
2977   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2978 
2979   // Counter
2980   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
2981 
2982   // Now adjust the caller's stack to make up for the extra locals, but
2983   // record the original sp first so that we can save it in the skeletal
2984   // interpreter frame; the stack walking of interpreter_sender
2985   // will then get the unextended sp value and not the "real" sp value.
2986 
2987   const Register sender_sp = r8;
2988 
2989   __ mov(sender_sp, rsp);
2990   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
2991   __ subptr(rsp, rbx);
2992 
2993   // Push interpreter frames in a loop
2994   Label loop;
2995   __ bind(loop);
2996   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2997   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2998   __ pushptr(Address(rcx, 0));     // Save return address
2999   __ enter();                      // Save old & set new rbp
3000   __ subptr(rsp, rbx);             // Prolog
3001   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3002             sender_sp);            // Make it walkable
3003   // This value is corrected by layout_activation_impl
3004   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3005   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3006   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3007   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3008   __ decrementl(rdx);              // Decrement counter
3009   __ jcc(Assembler::notZero, loop);
3010   __ pushptr(Address(rcx, 0));     // Save final return address
3011 
3012   // Re-push self-frame
3013   __ enter();                 // Save old & set new rbp
3014   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3015                               // Prolog
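  // (SimpleRuntimeFrame offsets are 32-bit slots, so the return address and rbp
  // pushed by enter() above account for 4 of the framesize slots; hence the "- 4".)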
3016 
3017   // Use rbp because the frames look interpreted now
3018   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3019   // Don't need the precise return PC here, just precise enough to point into this code blob.
3020   address the_pc = __ pc();
3021   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3022 
3023   // Call C code.  Need thread but NOT official VM entry
3024   // crud.  We cannot block on this call, no GC can happen.  Call should
3025   // restore return values to their stack-slots with the new SP.
3026   // The thread (first argument) is passed in c_rarg0 below.
3027   //
3028   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3029 
3030   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3031   __ mov(c_rarg0, r15_thread);
3032   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3033   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3034 
3035   // Set an oopmap for the call site
3036   // Use the same PC we used for the last java frame
3037   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3038 
3039   // Clear fp AND pc
3040   __ reset_last_Java_frame(true);
3041 
3042   // Pop self-frame.
3043   __ leave();                 // Epilog
3044 
3045   // Jump to interpreter
3046   __ ret(0);
3047 
3048   // Make sure all code is generated
3049   masm->flush();
3050 
3051   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3052                                                  SimpleRuntimeFrame::framesize >> 1);
3053 }
3054 #endif // COMPILER2
3055 
3056 //------------------------------generate_handler_blob------
3057 //
3058 // Generate a special Compile2Runtime blob that saves all registers,
3059 // and sets up an oopmap.
3060 //
3061 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3062   assert(StubRoutines::forward_exception_entry() != nullptr,
3063          "must be generated before");
3064 
3065   ResourceMark rm;
3066   OopMapSet *oop_maps = new OopMapSet();
3067   OopMap* map;
3068 
3069   // Allocate space for the code.  Setup code generation tools.
3070   CodeBuffer buffer("handler_blob", 2048, 1024);
3071   MacroAssembler* masm = new MacroAssembler(&buffer);
3072 
3073   address start   = __ pc();
3074   address call_pc = nullptr;
3075   int frame_size_in_words;
3076   bool cause_return = (poll_type == POLL_AT_RETURN);
3077   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3078 
3079   if (UseRTMLocking) {
3080     // Abort RTM transaction before calling runtime
3081     // because critical section will be large and will be
3082     // aborted anyway. Also nmethod could be deoptimized.
3083     __ xabort(0);
3084   }
3085 
3086   // Make room for return address (or push it again)
3087   if (!cause_return) {
3088     __ push(rbx);
3089   }
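  // (If the poll did not happen at a return there is no return address on top of the
  // stack yet; the push above just reserves that slot with a dummy value. The real pc,
  // taken from saved_exception_pc, is stored into it below.)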
3090 
3091   // Save registers, fpu state, and flags
3092   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3093 
3094   // The following is basically a call_VM.  However, we need the precise
3095   // address of the call in order to generate an oopmap. Hence, we do all the
3096   // work ourselves.
3097 
3098   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3099 
3100   // The return address must always be correct so that the frame constructor
3101   // never sees an invalid pc.
3102 
3103   if (!cause_return) {
3104     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3105     // Additionally, rbx is a callee-saved register and we can look at it later to determine
3106     // if someone changed the return address for us!
3107     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3108     __ movptr(Address(rbp, wordSize), rbx);
3109   }
3110 
3111   // Do the call
3112   __ mov(c_rarg0, r15_thread);
3113   __ call(RuntimeAddress(call_ptr));
3114 
3115   // Set an oopmap for the call site.  This oopmap will map all
3116   // oop-registers and debug-info registers as callee-saved.  This
3117   // will allow deoptimization at this safepoint to find all possible
3118   // debug-info recordings, as well as let GC find all oops.
3119 
3120   oop_maps->add_gc_map( __ pc() - start, map);
3121 
3122   Label noException;
3123 
3124   __ reset_last_Java_frame(false);
3125 
3126   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3127   __ jcc(Assembler::equal, noException);
3128 
3129   // Exception pending
3130 
3131   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3132 
3133   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3134 
3135   // No exception case
3136   __ bind(noException);
3137 
3138   Label no_adjust;
3139 #ifdef ASSERT
3140   Label bail;
3141 #endif
3142   if (!cause_return) {
3143     Label no_prefix, not_special;
3144 
3145     // If our stashed return pc was modified by the runtime we avoid touching it
3146     __ cmpptr(rbx, Address(rbp, wordSize));
3147     __ jccb(Assembler::notEqual, no_adjust);
3148 
3149     // Skip over the poll instruction.
3150     // See NativeInstruction::is_safepoint_poll()
3151     // Possible encodings:
3152     //      85 00       test   %eax,(%rax)
3153     //      85 01       test   %eax,(%rcx)
3154     //      85 02       test   %eax,(%rdx)
3155     //      85 03       test   %eax,(%rbx)
3156     //      85 06       test   %eax,(%rsi)
3157     //      85 07       test   %eax,(%rdi)
3158     //
3159     //   41 85 00       test   %eax,(%r8)
3160     //   41 85 01       test   %eax,(%r9)
3161     //   41 85 02       test   %eax,(%r10)
3162     //   41 85 03       test   %eax,(%r11)
3163     //   41 85 06       test   %eax,(%r14)
3164     //   41 85 07       test   %eax,(%r15)
3165     //
3166     //      85 04 24    test   %eax,(%rsp)
3167     //   41 85 04 24    test   %eax,(%r12)
3168     //      85 45 00    test   %eax,0x0(%rbp)
3169     //   41 85 45 00    test   %eax,0x0(%r13)
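    // Worked example (for illustration): a poll encoded as 41 85 02, i.e.
    // test %eax,(%r10), is skipped by first stepping over the REX.B prefix
    // (0x41); its modrm base is not rsp/rbp/r12/r13, so no extra byte is
    // consumed, and the final "addptr(rbx, 2)" below moves past "85 02".
    // For 85 45 00, i.e. test %eax,0x0(%rbp), there is no prefix, but the
    // modrm byte 0x45 selects the 3-byte form, so one extra byte is skipped
    // before the final +2.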
3170 
3171     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3172     __ jcc(Assembler::notEqual, no_prefix);
3173     __ addptr(rbx, 1);
3174     __ bind(no_prefix);
3175 #ifdef ASSERT
3176     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3177 #endif
3178     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3179     // r12/rsp 0x04
3180     // r13/rbp 0x05
3181     __ movzbq(rcx, Address(rbx, 1));
3182     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3183     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3184     __ cmpptr(rcx, 1);
3185     __ jcc(Assembler::above, not_special);
3186     __ addptr(rbx, 1);
3187     __ bind(not_special);
3188 #ifdef ASSERT
3189     // Verify the correct encoding of the poll we're about to skip.
3190     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3191     __ jcc(Assembler::notEqual, bail);
3192     // Mask out the modrm bits
3193     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3194     // rax encodes to 0, so if the bits are nonzero it's incorrect
3195     __ jcc(Assembler::notZero, bail);
3196 #endif
3197     // Adjust return pc forward to step over the safepoint poll instruction
3198     __ addptr(rbx, 2);
3199     __ movptr(Address(rbp, wordSize), rbx);
3200   }
3201 
3202   __ bind(no_adjust);
3203   // Normal exit, restore registers and exit.
3204   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3205   __ ret(0);
3206 
3207 #ifdef ASSERT
3208   __ bind(bail);
3209   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3210 #endif
3211 
3212   // Make sure all code is generated
3213   masm->flush();
3214 
3215   // Fill-out other meta info
3216   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3217 }
3218 
3219 //
3220 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3221 //
3222 // Generate a stub that calls into the VM to find out the proper destination
3223 // of a Java call. All the argument registers are live at this point,
3224 // but since this is generic code we don't know what they are; the caller
3225 // must do any GC of the args.
3226 //
3227 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3228   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3229 
3230   // allocate space for the code
3231   ResourceMark rm;
3232 
3233   CodeBuffer buffer(name, 1200, 512);
3234   MacroAssembler* masm = new MacroAssembler(&buffer);
3235 
3236   int frame_size_in_words;
3237 
3238   OopMapSet *oop_maps = new OopMapSet();
3239   OopMap* map = nullptr;
3240 
3241   int start = __ offset();
3242 
3243   // No need to save vector registers since they are caller-saved anyway.
3244   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3245 
3246   int frame_complete = __ offset();
3247 
3248   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3249 
3250   __ mov(c_rarg0, r15_thread);
3251 
3252   __ call(RuntimeAddress(destination));
3253 
3255   // Set an oopmap for the call site.
3256   // We need this not only for callee-saved registers, but also for volatile
3257   // registers that the compiler might be keeping live across a safepoint.
3258 
3259   oop_maps->add_gc_map( __ offset() - start, map);
3260 
3261   // rax contains the address we are going to jump to assuming no exception got installed
3262 
3263   // clear last_Java_sp
3264   __ reset_last_Java_frame(false);
3265   // check for pending exceptions
3266   Label pending;
3267   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3268   __ jcc(Assembler::notEqual, pending);
3269 
3270   // get the returned Method*
3271   __ get_vm_result_2(rbx, r15_thread);
3272   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3273 
3274   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3275 
3276   RegisterSaver::restore_live_registers(masm);
3277 
3278   // We are back to the original state on entry and ready to go.
3279 
3280   __ jmp(rax);
3281 
3282   // Pending exception after the safepoint
3283 
3284   __ bind(pending);
3285 
3286   RegisterSaver::restore_live_registers(masm);
3287 
3288   // exception pending => remove activation and forward to exception handler
3289 
3290   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3291 
3292   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3293   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3294 
3295   // -------------
3296   // make sure all code is generated
3297   masm->flush();
3298 
3299   // return the blob
3300   // (the frame size is in words, as the name frame_size_in_words says)
3301   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3302 }
3303 
3304 //------------------------------Montgomery multiplication------------------------
3305 //
3306 
3307 #ifndef _WINDOWS
3308 
3309 // Subtract 0:b from carry:a.  Return carry.
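// That is, compute (carry:a[len-1..0]) - (0:b[len-1..0]) in place: the low len limbs
// of the difference are left in a[], and the new top limb is returned.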
3310 static julong
3311 sub(julong a[], julong b[], julong carry, long len) {
3312   long long i = 0, cnt = len;
3313   julong tmp;
3314   asm volatile("clc; "
3315                "0: ; "
3316                "mov (%[b], %[i], 8), %[tmp]; "
3317                "sbb %[tmp], (%[a], %[i], 8); "
3318                "inc %[i]; dec %[cnt]; "
3319                "jne 0b; "
3320                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3321                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3322                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3323                : "memory");
3324   return tmp;
3325 }
3326 
3327 // Multiply (unsigned) Long A by Long B, accumulating the double-
3328 // length result into the accumulator formed of T0, T1, and T2.
3329 #define MACC(A, B, T0, T1, T2)                                  \
3330 do {                                                            \
3331   unsigned long hi, lo;                                         \
3332   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3333            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3334            : "r"(A), "a"(B) : "cc");                            \
3335  } while(0)
3336 
3337 // As above, but add twice the double-length result into the
3338 // accumulator.
3339 #define MACC2(A, B, T0, T1, T2)                                 \
3340 do {                                                            \
3341   unsigned long hi, lo;                                         \
3342   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3343            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3344            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3345            : "r"(A), "a"(B) : "cc");                            \
3346  } while(0)
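// In other words, MACC performs (T2:T1:T0) += (julong)A * B and MACC2 performs
// (T2:T1:T0) += 2 * (julong)A * B, where (T2:T1:T0) is a 192-bit accumulator held in
// three 64-bit variables.  MACC2 exists because, when squaring, each cross term
// a[i]*a[j] with i != j occurs twice in the result.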
3347 
3348 #else //_WINDOWS
3349 
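// Subtract 0:b from carry:a.  Return carry.  (Intrinsics version: a - b - borrow is
// computed as a + ~b + carry via _addcarry_u64, seeded with a carry of 1.)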
3350 static julong
3351 sub(julong a[], julong b[], julong carry, long len) {
3352   long i;
3353   julong tmp;
3354   unsigned char c = 1;
3355   for (i = 0; i < len; i++) {
3356     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3357     a[i] = tmp;
3358   }
3359   c = _addcarry_u64(c, carry, ~0, &tmp);
3360   return tmp;
3361 }
3362 
3363 // Multiply (unsigned) Long A by Long B, accumulating the double-
3364 // length result into the accumulator formed of T0, T1, and T2.
3365 #define MACC(A, B, T0, T1, T2)                          \
3366 do {                                                    \
3367   julong hi, lo;                                       \
3368   lo = _umul128(A, B, &hi);                             \
3369   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3370   c = _addcarry_u64(c, hi, T1, &T1);                    \
3371   _addcarry_u64(c, T2, 0, &T2);                         \
3372  } while(0)
3373 
3374 // As above, but add twice the double-length result into the
3375 // accumulator.
3376 #define MACC2(A, B, T0, T1, T2)                         \
3377 do {                                                    \
3378   julong hi, lo;                                       \
3379   lo = _umul128(A, B, &hi);                             \
3380   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3381   c = _addcarry_u64(c, hi, T1, &T1);                    \
3382   _addcarry_u64(c, T2, 0, &T2);                         \
3383   c = _addcarry_u64(0, lo, T0, &T0);                    \
3384   c = _addcarry_u64(c, hi, T1, &T1);                    \
3385   _addcarry_u64(c, T2, 0, &T2);                         \
3386  } while(0)
3387 
3388 #endif //_WINDOWS
3389 
3390 // Fast Montgomery multiplication.  The derivation of the algorithm is
3391 // in  A Cryptographic Library for the Motorola DSP56000,
3392 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3393 
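// A minimal single-word sketch of the reduction step (illustration only, not used
// below; it assumes a 128-bit integer type such as GCC/Clang's unsigned __int128
// and, as asserted in the routines below, inv == -n^-1 mod 2^64):
//
//   // Given t = t_hi:t_lo < n * 2^64, return t * 2^-64 mod n.
//   julong redc_one_word(julong t_lo, julong t_hi, julong n, julong inv) {
//     julong m = t_lo * inv;                   // t_lo + m*n == 0 (mod 2^64)
//     unsigned __int128 t = ((unsigned __int128)t_hi << 64) | t_lo;
//     t += (unsigned __int128)m * n;           // the low 64 bits are now zero
//     t >>= 64;                                // divide by 2^64; now t < 2*n
//     return (julong)(t >= n ? t - n : t);     // single conditional subtract
//   }
//
// The loops below do the same thing with multi-word operands, interleaving the
// multiplication a*b with the reduction and keeping each running column sum in the
// triple-precision accumulator (t2:t1:t0).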
3394 static void NOINLINE
3395 montgomery_multiply(julong a[], julong b[], julong n[],
3396                     julong m[], julong inv, int len) {
3397   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3398   int i;
3399 
3400   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3401 
3402   for (i = 0; i < len; i++) {
3403     int j;
3404     for (j = 0; j < i; j++) {
3405       MACC(a[j], b[i-j], t0, t1, t2);
3406       MACC(m[j], n[i-j], t0, t1, t2);
3407     }
3408     MACC(a[i], b[0], t0, t1, t2);
3409     m[i] = t0 * inv;
3410     MACC(m[i], n[0], t0, t1, t2);
3411 
3412     assert(t0 == 0, "broken Montgomery multiply");
3413 
3414     t0 = t1; t1 = t2; t2 = 0;
3415   }
3416 
3417   for (i = len; i < 2*len; i++) {
3418     int j;
3419     for (j = i-len+1; j < len; j++) {
3420       MACC(a[j], b[i-j], t0, t1, t2);
3421       MACC(m[j], n[i-j], t0, t1, t2);
3422     }
3423     m[i-len] = t0;
3424     t0 = t1; t1 = t2; t2 = 0;
3425   }
3426 
3427   while (t0)
3428     t0 = sub(m, n, t0, len);
3429 }
3430 
3431 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3432 // multiplies so it should be up to 25% faster than Montgomery
3433 // multiplication.  However, its loop control is more complex and it
3434 // may actually run slower on some machines.
3435 
3436 static void NOINLINE
3437 montgomery_square(julong a[], julong n[],
3438                   julong m[], julong inv, int len) {
3439   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3440   int i;
3441 
3442   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3443 
3444   for (i = 0; i < len; i++) {
3445     int j;
3446     int end = (i+1)/2;
3447     for (j = 0; j < end; j++) {
3448       MACC2(a[j], a[i-j], t0, t1, t2);
3449       MACC(m[j], n[i-j], t0, t1, t2);
3450     }
3451     if ((i & 1) == 0) {
3452       MACC(a[j], a[j], t0, t1, t2);
3453     }
3454     for (; j < i; j++) {
3455       MACC(m[j], n[i-j], t0, t1, t2);
3456     }
3457     m[i] = t0 * inv;
3458     MACC(m[i], n[0], t0, t1, t2);
3459 
3460     assert(t0 == 0, "broken Montgomery square");
3461 
3462     t0 = t1; t1 = t2; t2 = 0;
3463   }
3464 
3465   for (i = len; i < 2*len; i++) {
3466     int start = i-len+1;
3467     int end = start + (len - start)/2;
3468     int j;
3469     for (j = start; j < end; j++) {
3470       MACC2(a[j], a[i-j], t0, t1, t2);
3471       MACC(m[j], n[i-j], t0, t1, t2);
3472     }
3473     if ((i & 1) == 0) {
3474       MACC(a[j], a[j], t0, t1, t2);
3475     }
3476     for (; j < len; j++) {
3477       MACC(m[j], n[i-j], t0, t1, t2);
3478     }
3479     m[i-len] = t0;
3480     t0 = t1; t1 = t2; t2 = 0;
3481   }
3482 
3483   while (t0)
3484     t0 = sub(m, n, t0, len);
3485 }
3486 
3487 // Swap words in a longword.
3488 static julong swap(julong x) {
3489   return (x << 32) | (x >> 32);
3490 }
3491 
3492 // Copy len longwords from s to d, word-swapping as we go.  The
3493 // destination array is reversed.
3494 static void reverse_words(julong *s, julong *d, int len) {
3495   d += len;
3496   while(len-- > 0) {
3497     d--;
3498     *d = swap(*s);
3499     s++;
3500   }
3501 }
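// For example, with len == 2 and the 128-bit value 0x00000001_00000002_00000003_00000004
// stored as jints most-significant word first (as BigInteger magnitudes are), reading
// that memory as julongs on little-endian x86 gives
//   s = { 0x0000000200000001, 0x0000000400000003 },
// and reverse_words produces
//   d = { 0x0000000300000004, 0x0000000100000002 },
// i.e. 64-bit limbs least-significant first, which is what the Montgomery routines
// above operate on.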
3502 
3503 // The threshold at which squaring is advantageous was determined
3504 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3505 #define MONTGOMERY_SQUARING_THRESHOLD 64
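// (The threshold is compared against len, which is in jints, so squaring is used for
// operands of at least 64 * 32 = 2048 bits.)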
3506 
3507 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3508                                         jint len, jlong inv,
3509                                         jint *m_ints) {
3510   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3511   int longwords = len/2;
3512 
3513   // Make very sure we don't use so much space that the stack might
3514   // overflow.  512 jints corresponds to a 16384-bit integer and
3515   // will use here a total of 8k bytes of stack space.
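  // For example, at the limit allowed by the guarantee below: 512 jints -> 256
  // longwords, and 256 longwords * 4 arrays * 8 bytes == 8192 bytes.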
3516   int divisor = sizeof(julong) * 4;
3517   guarantee(longwords <= 8192 / divisor, "must be");
3518   int total_allocation = longwords * sizeof (julong) * 4;
3519   julong *scratch = (julong *)alloca(total_allocation);
3520 
3521   // Local scratch arrays
3522   julong
3523     *a = scratch + 0 * longwords,
3524     *b = scratch + 1 * longwords,
3525     *n = scratch + 2 * longwords,
3526     *m = scratch + 3 * longwords;
3527 
3528   reverse_words((julong *)a_ints, a, longwords);
3529   reverse_words((julong *)b_ints, b, longwords);
3530   reverse_words((julong *)n_ints, n, longwords);
3531 
3532   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3533 
3534   reverse_words(m, (julong *)m_ints, longwords);
3535 }
3536 
3537 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3538                                       jint len, jlong inv,
3539                                       jint *m_ints) {
3540   assert(len % 2 == 0, "array length in montgomery_square must be even");
3541   int longwords = len/2;
3542 
3543   // Make very sure we don't use so much space that the stack might
3544   // overflow.  512 jints corresponds to a 16384-bit integer and
3545   // will use here a total of 6k bytes of stack space.
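  // For example, with 512 jints -> 256 longwords: 256 longwords * 3 arrays * 8 bytes
  // == 6144 bytes; the guarantee below actually allows up to 8192 / 24 == 341 longwords.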
3546   int divisor = sizeof(julong) * 3;
3547   guarantee(longwords <= (8192 / divisor), "must be");
3548   int total_allocation = longwords * sizeof (julong) * 3;
3549   julong *scratch = (julong *)alloca(total_allocation);
3550 
3551   // Local scratch arrays
3552   julong
3553     *a = scratch + 0 * longwords,
3554     *n = scratch + 1 * longwords,
3555     *m = scratch + 2 * longwords;
3556 
3557   reverse_words((julong *)a_ints, a, longwords);
3558   reverse_words((julong *)n_ints, n, longwords);
3559 
3560   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3561     ::montgomery_square(a, n, m, (julong)inv, longwords);
3562   } else {
3563     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3564   }
3565 
3566   reverse_words(m, (julong *)m_ints, longwords);
3567 }
3568 
3569 #ifdef COMPILER2
3570 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3571 //
3572 //------------------------------generate_exception_blob---------------------------
3573 // Creates the exception blob at the end.
3574 // Using the exception blob, this code is jumped to from a compiled method.
3575 // (see emit_exception_handler in x86_64.ad file)
3576 //
3577 // Given an exception pc at a call we call into the runtime for the
3578 // handler in this method. This handler might merely restore state
3579 // (i.e. callee-save registers), unwind the frame, and jump to the
3580 // exception handler of the caller if there is no Java-level handler
3581 // for the nmethod.
3582 //
3583 // This code is entered with a jmp.
3584 //
3585 // Arguments:
3586 //   rax: exception oop
3587 //   rdx: exception pc
3588 //
3589 // Results:
3590 //   rax: exception oop
3591 //   rdx: exception pc in caller or ???
3592 //   destination: exception handler of caller
3593 //
3594 // Note: the exception pc MUST be at a call (precise debug information)
3595 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3596 //
3597 
3598 void OptoRuntime::generate_exception_blob() {
3599   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3600   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3601   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3602 
3603   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3604 
3605   // Allocate space for the code
3606   ResourceMark rm;
3607   // Setup code generation tools
3608   CodeBuffer buffer("exception_blob", 2048, 1024);
3609   int pc_offset = 0;
3610   if (SCCache::load_exception_blob(&buffer, &pc_offset)) {
3611     OopMapSet* oop_maps = new OopMapSet();
3612     oop_maps->add_gc_map(pc_offset, new OopMap(SimpleRuntimeFrame::framesize, 0));
3613 
3614     // Set exception blob
3615     _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3616     return;
3617   }
3618 
3619   MacroAssembler* masm = new MacroAssembler(&buffer);
3620   address start = __ pc();
3621 
3622   // Exception pc is 'return address' for stack walker
3623   __ push(rdx);
3624   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3625 
3626   // Save callee-saved registers.  See x86_64.ad.
3627 
3628   // rbp is an implicitly saved callee-saved register (i.e., the calling
3629   // convention will save/restore it in the prolog/epilog). Other than that
3630   // there are no callee-saved registers now that adapter frames are gone.
3631 
3632   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3633 
3634   // Store exception in Thread object. We cannot pass any arguments to the
3635   // handle_exception call, since we do not want to make any assumption
3636   // about the size of the frame in which the exception happened.
3637   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3638   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3639   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3640 
3641   // This call does all the hard work.  It checks if an exception handler
3642   // exists in the method.
3643   // If so, it returns the handler address.
3644   // If not, it prepares for stack-unwinding, restoring the callee-save
3645   // registers of the frame being removed.
3646   //
3647   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3648 
3649   // At a method handle call, the stack may not be properly aligned
3650   // when returning with an exception.
3651   address the_pc = __ pc();
3652   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3653   __ mov(c_rarg0, r15_thread);
3654   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3655   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3656 
3657   // Set an oopmap for the call site.  This oopmap will only be used if we
3658   // are unwinding the stack.  Hence, all locations will be dead.
3659   // Callee-saved registers will be the same as the frame above (i.e.,
3660   // handle_exception_stub), since they were restored when we got the
3661   // exception.
3662 
3663   OopMapSet* oop_maps = new OopMapSet();
3664 
3665   pc_offset = the_pc - start;
3666   oop_maps->add_gc_map(pc_offset, new OopMap(SimpleRuntimeFrame::framesize, 0));
3667 
3668   __ reset_last_Java_frame(false);
3669 
3670   // Restore callee-saved registers
3671 
3672   // rbp is an implicitly saved callee-saved register (i.e., the calling
3673   // convention will save/restore it in the prolog/epilog). Other than that
3674   // there are no callee-saved registers now that adapter frames are gone.
3675 
3676   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3677 
3678   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3679   __ pop(rdx);                  // No need for exception pc anymore
3680 
3681   // rax: exception handler
3682 
3683   // We have a handler in rax (could be deopt blob).
3684   __ mov(r8, rax);
3685 
3686   // Get the exception oop
3687   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3688   // Get the exception pc in case we are deoptimized
3689   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3690 #ifdef ASSERT
3691   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3692   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3693 #endif
3694   // Clear the exception oop so GC no longer processes it as a root.
3695   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3696 
3697   // rax: exception oop
3698   // r8:  exception handler
3699   // rdx: exception pc
3700   // Jump to handler
3701 
3702   __ jmp(r8);
3703 
3704   // Make sure all code is generated
3705   masm->flush();
3706 
3707   SCCache::store_exception_blob(&buffer, pc_offset);
3708   // Set exception blob
3709   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3710 }
3711 #endif // COMPILER2
3712