1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/vframeArray.hpp"
  56 #include "runtime/vm_version.hpp"
  57 #include "utilities/align.hpp"
  58 #include "utilities/checkedCast.hpp"
  59 #include "utilities/formatBuffer.hpp"
  60 #include "vmreg_x86.inline.hpp"
  61 #ifdef COMPILER1
  62 #include "c1/c1_Runtime1.hpp"
  63 #endif
  64 #ifdef COMPILER2
  65 #include "opto/runtime.hpp"
  66 #endif
  67 #if INCLUDE_JVMCI
  68 #include "jvmci/jvmciJavaClasses.hpp"
  69 #endif
  70 
  71 #define __ masm->
  72 
  73 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  74 
  75 class SimpleRuntimeFrame {
  76 
  77   public:
  78 
  79   // Most of the runtime stubs have this simple frame layout.
  80   // This class exists to make the layout shared in one place.
  81   // Offsets are for compiler stack slots, which are jints.
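       // For example, on targets where frame::arg_reg_save_area_bytes is 0,
       // rbp_off is slot 0 and framesize is 4 slots, i.e. a 16-byte frame
       // holding just the saved rbp and the return address.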
  82   enum layout {
  83     // The frame sender code expects that rbp will be in the "natural" place and
  84     // will override any oopMap setting for it. We must therefore force the layout
  85     // so that it agrees with the frame sender code.
  86     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  87     rbp_off2,
  88     return_off, return_off2,
  89     framesize
  90   };
  91 };
  92 
  93 class RegisterSaver {
  94   // Capture info about frame layout.  Layout offsets are in jint
  95   // units because compiler frame slots are jints.
  96 #define XSAVE_AREA_BEGIN 160
  97 #define XSAVE_AREA_YMM_BEGIN 576
  98 #define XSAVE_AREA_OPMASK_BEGIN 1088
  99 #define XSAVE_AREA_ZMM_BEGIN 1152
 100 #define XSAVE_AREA_UPPERBANK 1664
 101 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 102 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 103 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 104 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 105 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
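     // For example, DEF_XMM_OFFS(1) expands to
     //   xmm1_off = xmm_off + 1*16/BytesPerInt, xmm1H_off
     // i.e. each XMM register's 16-byte fxsave slot spans four jint stack slots,
     // with only the first two slots given names (_off and H_off).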
 106   enum layout {
 107     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 108     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 109     DEF_XMM_OFFS(0),
 110     DEF_XMM_OFFS(1),
 111     // 2..15 are implied in range usage
 112     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 113     DEF_YMM_OFFS(0),
 114     DEF_YMM_OFFS(1),
 115     // 2..15 are implied in range usage
 116     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 117     DEF_OPMASK_OFFS(0),
 118     DEF_OPMASK_OFFS(1),
 119     // 2..7 are implied in range usage
 120     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 121     DEF_ZMM_OFFS(0),
 122     DEF_ZMM_OFFS(1),
 123     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 124     DEF_ZMM_UPPER_OFFS(16),
 125     DEF_ZMM_UPPER_OFFS(17),
 126     // 18..31 are implied in range usage
 127     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 128     fpu_stateH_end,
 129     r15_off, r15H_off,
 130     r14_off, r14H_off,
 131     r13_off, r13H_off,
 132     r12_off, r12H_off,
 133     r11_off, r11H_off,
 134     r10_off, r10H_off,
 135     r9_off,  r9H_off,
 136     r8_off,  r8H_off,
 137     rdi_off, rdiH_off,
 138     rsi_off, rsiH_off,
 139     ignore_off, ignoreH_off,  // extra copy of rbp
 140     rsp_off, rspH_off,
 141     rbx_off, rbxH_off,
 142     rdx_off, rdxH_off,
 143     rcx_off, rcxH_off,
 144     rax_off, raxH_off,
 145     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 146     align_off, alignH_off,
 147     flags_off, flagsH_off,
 148     // The frame sender code expects that rbp will be in the "natural" place and
 149     // will override any oopMap setting for it. We must therefore force the layout
 150     // so that it agrees with the frame sender code.
 151     rbp_off, rbpH_off,        // copy of rbp we will restore
 152     return_off, returnH_off,  // slot for return address
 153     reg_save_size             // size in compiler stack slots
 154   };
 155 
 156  public:
 157   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 158   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 159 
 160   // Offsets into the register save area
 161   // Used by deoptimization when it is managing result register
 162   // values on its own
 163 
 164   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 165   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 166   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 167   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 168   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
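       // These are byte offsets from rsp once save_live_registers() has laid out the
       // frame, i.e. the compiler slot index scaled by BytesPerInt.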
 169 
 170   // During deoptimization only the result registers need to be restored,
 171   // all the other values have already been extracted.
 172   static void restore_result_registers(MacroAssembler* masm);
 173 };
 174 
 175 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 176   int off = 0;
 177   int num_xmm_regs = XMMRegister::available_xmm_registers();
 178 #if COMPILER2_OR_JVMCI
 179   if (save_wide_vectors && UseAVX == 0) {
 180     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 181   }
 182   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 183 #else
 184   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 185 #endif
 186 
 187   // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 188   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 189   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 190   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 191   // CodeBlob frame size is in words.
 192   int frame_size_in_words = frame_size_in_bytes / wordSize;
 193   *total_frame_words = frame_size_in_words;
 194 
 195   // Save registers, fpu state, and flags.
 196   // We assume caller has already pushed the return address onto the
 197   // stack, so rsp is 8-byte aligned here.
 198   // We push rbp twice in this sequence because we want the real rbp
 199   // to be under the return address like a normal enter.
 200 
 201   __ enter();          // rsp becomes 16-byte aligned here
 202   __ push_CPU_state(); // Push a multiple of 16 bytes
 203 
 204   // push_CPU_state already handles this on EVEX-enabled targets
 205   if (save_wide_vectors) {
 206     // Save upper half of YMM registers(0..15)
 207     int base_addr = XSAVE_AREA_YMM_BEGIN;
 208     for (int n = 0; n < 16; n++) {
 209       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 210     }
 211     if (VM_Version::supports_evex()) {
 212       // Save upper half of ZMM registers(0..15)
 213       base_addr = XSAVE_AREA_ZMM_BEGIN;
 214       for (int n = 0; n < 16; n++) {
 215         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 216       }
 217       // Save full ZMM registers(16..num_xmm_regs)
 218       base_addr = XSAVE_AREA_UPPERBANK;
 219       off = 0;
 220       int vector_len = Assembler::AVX_512bit;
 221       for (int n = 16; n < num_xmm_regs; n++) {
 222         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 223       }
 224 #if COMPILER2_OR_JVMCI
 225       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 226       off = 0;
 227       for(int n = 0; n < KRegister::number_of_registers; n++) {
 228         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 229       }
 230 #endif
 231     }
 232   } else {
 233     if (VM_Version::supports_evex()) {
 234       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 235       int base_addr = XSAVE_AREA_UPPERBANK;
 236       off = 0;
 237       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 238       for (int n = 16; n < num_xmm_regs; n++) {
 239         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 240       }
 241 #if COMPILER2_OR_JVMCI
 242       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 243       off = 0;
 244       for(int n = 0; n < KRegister::number_of_registers; n++) {
 245         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 246       }
 247 #endif
 248     }
 249   }
 250   __ vzeroupper();
 251   if (frame::arg_reg_save_area_bytes != 0) {
 252     // Allocate argument register save area
 253     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 254   }
 255 
 256   // Set an oopmap for the call site.  This oopmap will map all
 257   // oop-registers and debug-info registers as callee-saved.  This
 258   // will allow deoptimization at this safepoint to find all possible
 259   // debug-info recordings, as well as let GC find all oops.
 260 
 261   OopMapSet *oop_maps = new OopMapSet();
 262   OopMap* map = new OopMap(frame_size_in_slots, 0);
 263 
 264 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 265 
 266   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 267   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 270   // rbp location is known implicitly by the frame sender code, needs no oopmap
 271   // and the location where rbp was saved is ignored
 272   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 273   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 282   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 283   // on EVEX-enabled targets it is also included in the XSAVE area
 284   off = xmm0_off;
 285   int delta = xmm1_off - off;
 286   for (int n = 0; n < 16; n++) {
 287     XMMRegister xmm_name = as_XMMRegister(n);
 288     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 289     off += delta;
 290   }
 291   if (UseAVX > 2) {
 292     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 293     off = zmm16_off;
 294     delta = zmm17_off - off;
 295     for (int n = 16; n < num_xmm_regs; n++) {
 296       XMMRegister zmm_name = as_XMMRegister(n);
 297       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 298       off += delta;
 299     }
 300   }
 301 
 302 #if COMPILER2_OR_JVMCI
 303   if (save_wide_vectors) {
 304     // Save upper half of YMM registers(0..15)
 305     off = ymm0_off;
 306     delta = ymm1_off - ymm0_off;
 307     for (int n = 0; n < 16; n++) {
 308       XMMRegister ymm_name = as_XMMRegister(n);
 309       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 310       off += delta;
 311     }
 312     if (VM_Version::supports_evex()) {
 313       // Save upper half of ZMM registers(0..15)
 314       off = zmm0_off;
 315       delta = zmm1_off - zmm0_off;
 316       for (int n = 0; n < 16; n++) {
 317         XMMRegister zmm_name = as_XMMRegister(n);
 318         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 319         off += delta;
 320       }
 321     }
 322   }
 323 #endif // COMPILER2_OR_JVMCI
 324 
 325   // %%% These should all be a waste but we'll keep things as they were for now
 326   if (true) {
 327     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 328     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 331     // rbp location is known implicitly by the frame sender code, needs no oopmap
 332     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 333     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 342     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 343     // on EVEX-enabled targets it is also included in the XSAVE area
 344     off = xmm0H_off;
 345     delta = xmm1H_off - off;
 346     for (int n = 0; n < 16; n++) {
 347       XMMRegister xmm_name = as_XMMRegister(n);
 348       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 349       off += delta;
 350     }
 351     if (UseAVX > 2) {
 352       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 353       off = zmm16H_off;
 354       delta = zmm17H_off - off;
 355       for (int n = 16; n < num_xmm_regs; n++) {
 356         XMMRegister zmm_name = as_XMMRegister(n);
 357         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 358         off += delta;
 359       }
 360     }
 361   }
 362 
 363   return map;
 364 }
 365 
 366 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 367   int num_xmm_regs = XMMRegister::available_xmm_registers();
 368   if (frame::arg_reg_save_area_bytes != 0) {
 369     // Pop arg register save area
 370     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 371   }
 372 
 373 #if COMPILER2_OR_JVMCI
 374   if (restore_wide_vectors) {
 375     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 376     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 377   }
 378 #else
 379   assert(!restore_wide_vectors, "vectors are generated only by C2");
 380 #endif
 381 
 382   __ vzeroupper();
 383 
 384   // On EVEX enabled targets everything is handled in pop fpu state
 385   if (restore_wide_vectors) {
 386     // Restore upper half of YMM registers (0..15)
 387     int base_addr = XSAVE_AREA_YMM_BEGIN;
 388     for (int n = 0; n < 16; n++) {
 389       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 390     }
 391     if (VM_Version::supports_evex()) {
 392       // Restore upper half of ZMM registers (0..15)
 393       base_addr = XSAVE_AREA_ZMM_BEGIN;
 394       for (int n = 0; n < 16; n++) {
 395         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 396       }
 397       // Restore full ZMM registers(16..num_xmm_regs)
 398       base_addr = XSAVE_AREA_UPPERBANK;
 399       int vector_len = Assembler::AVX_512bit;
 400       int off = 0;
 401       for (int n = 16; n < num_xmm_regs; n++) {
 402         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 403       }
 404 #if COMPILER2_OR_JVMCI
 405       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 406       off = 0;
 407       for (int n = 0; n < KRegister::number_of_registers; n++) {
 408         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 409       }
 410 #endif
 411     }
 412   } else {
 413     if (VM_Version::supports_evex()) {
 414       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 415       int base_addr = XSAVE_AREA_UPPERBANK;
 416       int off = 0;
 417       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 418       for (int n = 16; n < num_xmm_regs; n++) {
 419         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 420       }
 421 #if COMPILER2_OR_JVMCI
 422       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 423       off = 0;
 424       for (int n = 0; n < KRegister::number_of_registers; n++) {
 425         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 426       }
 427 #endif
 428     }
 429   }
 430 
 431   // Recover CPU state
 432   __ pop_CPU_state();
 433   // Get the rbp described implicitly by the calling convention (no oopMap)
 434   __ pop(rbp);
 435 }
 436 
 437 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 438 
 439   // Just restore result register. Only used by deoptimization. By
 440   // now any callee save register that needs to be restored to a c2
 441   // caller of the deoptee has been extracted into the vframeArray
 442   // and will be stuffed into the c2i adapter we create for later
 443   // restoration so only result registers need to be restored here.
 444 
 445   // Restore fp result register
 446   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 447   // Restore integer result register
 448   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 449   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 450 
 451   // Pop all of the register save area off the stack except the return address
 452   __ addptr(rsp, return_offset_in_bytes());
 453 }
 454 
 455 // Is vector's size (in bytes) bigger than a size saved by default?
 456 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
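     // (e.g. 32-byte YMM and 64-byte ZMM vectors are "wide"; the low 16 bytes of each
     // register are already covered by the default fxsave/fxrstor image.)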
 457 bool SharedRuntime::is_wide_vector(int size) {
 458   return size > 16;
 459 }
 460 
 461 // ---------------------------------------------------------------------------
 462 // Read the array of BasicTypes from a signature, and compute where the
 463 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 464 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 465 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 466 // as framesizes are fixed.
 467 // VMRegImpl::stack0 refers to the first slot 0(sp),
 468 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
 469 // Registers up to Register::number_of_registers are the 64-bit
 470 // integer registers.
 471 
 472 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 473 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 474 // units regardless of build. Of course for i486 there is no 64-bit build.
 475 
 476 // The Java calling convention is a "shifted" version of the C ABI.
 477 // By skipping the first C ABI register we can call non-static jni methods
 478 // with small numbers of arguments without having to shuffle the arguments
 479 // at all. Since we control the java ABI we ought to at least get some
 480 // advantage out of it.
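     // Concretely, j_rarg0 maps to c_rarg1 and the last Java argument register reuses
     // c_rarg0 (see the per-platform j_rarg/c_rarg definitions), so for a non-static
     // native call the receiver is already in the register the C ABI expects for the
     // second argument and only the JNIEnv* has to be placed in c_rarg0.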
 481 
 482 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 483                                            VMRegPair *regs,
 484                                            int total_args_passed) {
 485 
 486   // Create the mapping between argument positions and
 487   // registers.
 488   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 489     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 490   };
 491   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 492     j_farg0, j_farg1, j_farg2, j_farg3,
 493     j_farg4, j_farg5, j_farg6, j_farg7
 494   };
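       // Worked example (a sketch): for a signature (int, long, double) the loop below
       // assigns j_rarg0 to the int, j_rarg1 to the long (its trailing T_VOID half gets
       // set_bad()), and j_farg0 to the double, returning stk_args == 0 since nothing
       // spilled to the stack.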
 495 
 496 
 497   uint int_args = 0;
 498   uint fp_args = 0;
 499   uint stk_args = 0;
 500 
 501   for (int i = 0; i < total_args_passed; i++) {
 502     switch (sig_bt[i]) {
 503     case T_BOOLEAN:
 504     case T_CHAR:
 505     case T_BYTE:
 506     case T_SHORT:
 507     case T_INT:
 508       if (int_args < Argument::n_int_register_parameters_j) {
 509         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 510       } else {
 511         stk_args = align_up(stk_args, 2);
 512         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 513         stk_args += 1;
 514       }
 515       break;
 516     case T_VOID:
 517       // halves of T_LONG or T_DOUBLE
 518       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 519       regs[i].set_bad();
 520       break;
 521     case T_LONG:
 522       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 523       // fall through
 524     case T_OBJECT:
 525     case T_ARRAY:
 526     case T_ADDRESS:
 527       if (int_args < Argument::n_int_register_parameters_j) {
 528         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 529       } else {
 530         stk_args = align_up(stk_args, 2);
 531         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 532         stk_args += 2;
 533       }
 534       break;
 535     case T_FLOAT:
 536       if (fp_args < Argument::n_float_register_parameters_j) {
 537         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 538       } else {
 539         stk_args = align_up(stk_args, 2);
 540         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 541         stk_args += 1;
 542       }
 543       break;
 544     case T_DOUBLE:
 545       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 546       if (fp_args < Argument::n_float_register_parameters_j) {
 547         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 548       } else {
 549         stk_args = align_up(stk_args, 2);
 550         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 551         stk_args += 2;
 552       }
 553       break;
 554     default:
 555       ShouldNotReachHere();
 556       break;
 557     }
 558   }
 559 
 560   return stk_args;
 561 }
 562 
 563 // Patch the caller's callsite with the entry to compiled code, if it exists.
 564 static void patch_callers_callsite(MacroAssembler *masm) {
 565   Label L;
 566   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 567   __ jcc(Assembler::equal, L);
 568 
 569   // Save the current stack pointer
 570   __ mov(r13, rsp);
 571   // Schedule the branch target address early.
 572   // Call into the VM to patch the caller, then jump to compiled callee
 573   // rax isn't live so capture return address while we easily can
 574   __ movptr(rax, Address(rsp, 0));
 575 
 576   // align stack so push_CPU_state doesn't fault
 577   __ andptr(rsp, -(StackAlignmentInBytes));
 578   __ push_CPU_state();
 579   __ vzeroupper();
 580   // VM needs caller's callsite
 581   // VM needs target method
 582   // This needs to be a long call since we will relocate this adapter to
 583   // the codeBuffer and it may not reach
 584 
 585   // Allocate argument register save area
 586   if (frame::arg_reg_save_area_bytes != 0) {
 587     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 588   }
 589   __ mov(c_rarg0, rbx);
 590   __ mov(c_rarg1, rax);
 591   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 592 
 593   // De-allocate argument register save area
 594   if (frame::arg_reg_save_area_bytes != 0) {
 595     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 596   }
 597 
 598   __ vzeroupper();
 599   __ pop_CPU_state();
 600   // restore sp
 601   __ mov(rsp, r13);
 602   __ bind(L);
 603 }
 604 
 605 
 606 static void gen_c2i_adapter(MacroAssembler *masm,
 607                             int total_args_passed,
 608                             int comp_args_on_stack,
 609                             const BasicType *sig_bt,
 610                             const VMRegPair *regs,
 611                             Label& skip_fixup) {
 612   // Before we get into the guts of the C2I adapter, see if we should be here
 613   // at all.  We've come from compiled code and are attempting to jump to the
 614   // interpreter, which means the caller made a static call to get here
 615   // (vcalls always get a compiled target if there is one).  Check for a
 616   // compiled target.  If there is one, we need to patch the caller's call.
 617   patch_callers_callsite(masm);
 618 
 619   __ bind(skip_fixup);
 620 
 621   // Since all args are passed on the stack, total_args_passed *
 622   // Interpreter::stackElementSize is the space we need.
 623 
 624   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 625 
 626   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 627 
 628   // stack is aligned, keep it that way
 629   // This is not currently needed or enforced by the interpreter, but
 630   // we might as well conform to the ABI.
 631   extraspace = align_up(extraspace, 2*wordSize);
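       // For example, with 3 arguments and an 8-byte Interpreter::stackElementSize this
       // is 24 bytes, rounded up to 32 so rsp stays 16-byte aligned once the space is
       // carved out below.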
 632 
 633   // set senderSP value
 634   __ lea(r13, Address(rsp, wordSize));
 635 
 636 #ifdef ASSERT
 637   __ check_stack_alignment(r13, "sender stack not aligned");
 638 #endif
 639   if (extraspace > 0) {
 640     // Pop the return address
 641     __ pop(rax);
 642 
 643     __ subptr(rsp, extraspace);
 644 
 645     // Push the return address
 646     __ push(rax);
 647 
 648     // Account for the return address location since we store it first rather
 649     // than hold it in a register across all the shuffling
 650     extraspace += wordSize;
 651   }
 652 
 653 #ifdef ASSERT
 654   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 655 #endif
 656 
 657   // Now write the args into the outgoing interpreter space
 658   for (int i = 0; i < total_args_passed; i++) {
 659     if (sig_bt[i] == T_VOID) {
 660       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 661       continue;
 662     }
 663 
 664     // offset to start parameters
 665     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 666     int next_off = st_off - Interpreter::stackElementSize;
 667 
 668     // Say 4 args:
 669     // i   st_off
 670     // 0   32 T_LONG
 671     // 1   24 T_VOID
 672     // 2   16 T_OBJECT
 673     // 3    8 T_BOOL
 674     // -    0 return address
 675     //
 676     // However, to make things extra confusing: because we can fit a long/double in
 677     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 678     // leaves one slot empty and only stores to a single slot. In this case the
 679     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 680 
 681     VMReg r_1 = regs[i].first();
 682     VMReg r_2 = regs[i].second();
 683     if (!r_1->is_valid()) {
 684       assert(!r_2->is_valid(), "");
 685       continue;
 686     }
 687     if (r_1->is_stack()) {
 688       // memory to memory; use rax as a temporary
 689       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 690       if (!r_2->is_valid()) {
 691         // sign extend??
 692         __ movl(rax, Address(rsp, ld_off));
 693         __ movptr(Address(rsp, st_off), rax);
 694 
 695       } else {
 696 
 697         __ movq(rax, Address(rsp, ld_off));
 698 
 699         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 700         // T_DOUBLE and T_LONG use two slots in the interpreter
 701         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 702           // ld_off == LSW, ld_off+wordSize == MSW
 703           // st_off == MSW, next_off == LSW
 704           __ movq(Address(rsp, next_off), rax);
 705 #ifdef ASSERT
 706           // Overwrite the unused slot with known junk
 707           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 708           __ movptr(Address(rsp, st_off), rax);
 709 #endif /* ASSERT */
 710         } else {
 711           __ movq(Address(rsp, st_off), rax);
 712         }
 713       }
 714     } else if (r_1->is_Register()) {
 715       Register r = r_1->as_Register();
 716       if (!r_2->is_valid()) {
 717         // must be only an int (or smaller), so move only 32 bits to the slot
 718         // why not sign extend??
 719         __ movl(Address(rsp, st_off), r);
 720       } else {
 721         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 722         // T_DOUBLE and T_LONG use two slots in the interpreter
 723         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 724           // long/double in gpr
 725 #ifdef ASSERT
 726           // Overwrite the unused slot with known junk
 727           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 728           __ movptr(Address(rsp, st_off), rax);
 729 #endif /* ASSERT */
 730           __ movq(Address(rsp, next_off), r);
 731         } else {
 732           __ movptr(Address(rsp, st_off), r);
 733         }
 734       }
 735     } else {
 736       assert(r_1->is_XMMRegister(), "");
 737       if (!r_2->is_valid()) {
 738         // only a float; use just part of the slot
 739         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 740       } else {
 741 #ifdef ASSERT
 742         // Overwrite the unused slot with known junk
 743         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 744         __ movptr(Address(rsp, st_off), rax);
 745 #endif /* ASSERT */
 746         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 747       }
 748     }
 749   }
 750 
 751   // Schedule the branch target address early.
 752   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 753   __ jmp(rcx);
 754 }
 755 
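     // Emit an inline check of pc_reg against (code_start, code_end): control jumps to
     // L_ok when the pc lies strictly inside the range, and otherwise falls through to
     // whatever failure handling the caller emits after this check.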
 756 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 757                         address code_start, address code_end,
 758                         Label& L_ok) {
 759   Label L_fail;
 760   __ lea(temp_reg, ExternalAddress(code_start));
 761   __ cmpptr(pc_reg, temp_reg);
 762   __ jcc(Assembler::belowEqual, L_fail);
 763   __ lea(temp_reg, ExternalAddress(code_end));
 764   __ cmpptr(pc_reg, temp_reg);
 765   __ jcc(Assembler::below, L_ok);
 766   __ bind(L_fail);
 767 }
 768 
 769 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 770                                     int total_args_passed,
 771                                     int comp_args_on_stack,
 772                                     const BasicType *sig_bt,
 773                                     const VMRegPair *regs) {
 774 
 775   // Note: r13 contains the senderSP on entry. We must preserve it since
 776   // we may do a i2c -> c2i transition if we lose a race where compiled
 777   // code goes non-entrant while we get args ready.
 778   // In addition we use r13 to locate all the interpreter args, because
 779   // we must align the stack to 16 bytes on an i2c entry; otherwise we
 780   // lose the alignment we expect in all compiled code, and the register
 781   // save code can segv when fxsave instructions find an improperly
 782   // aligned stack pointer.
 783 
 784   // Adapters can be frameless because they do not require the caller
 785   // to perform additional cleanup work, such as correcting the stack pointer.
 786   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 787   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 788   // even if a callee has modified the stack pointer.
 789   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 790   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 791   // up via the senderSP register).
 792   // In other words, if *either* the caller or callee is interpreted, we can
 793   // get the stack pointer repaired after a call.
 794   // This is why c2i and i2c adapters cannot be indefinitely composed.
 795   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 796   // both caller and callee would be compiled methods, and neither would
 797   // clean up the stack pointer changes performed by the two adapters.
 798   // If this happens, control eventually transfers back to the compiled
 799   // caller, but with an uncorrected stack, causing delayed havoc.
 800 
 801   if (VerifyAdapterCalls &&
 802       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 803     // So, let's test for cascading c2i/i2c adapters right now.
 804     //  assert(Interpreter::contains($return_addr) ||
 805     //         StubRoutines::contains($return_addr),
 806     //         "i2c adapter must return to an interpreter frame");
 807     __ block_comment("verify_i2c { ");
 808     // Pick up the return address
 809     __ movptr(rax, Address(rsp, 0));
 810     Label L_ok;
 811     if (Interpreter::code() != nullptr) {
 812       range_check(masm, rax, r11,
 813                   Interpreter::code()->code_start(),
 814                   Interpreter::code()->code_end(),
 815                   L_ok);
 816     }
 817     if (StubRoutines::initial_stubs_code() != nullptr) {
 818       range_check(masm, rax, r11,
 819                   StubRoutines::initial_stubs_code()->code_begin(),
 820                   StubRoutines::initial_stubs_code()->code_end(),
 821                   L_ok);
 822     }
 823     if (StubRoutines::final_stubs_code() != nullptr) {
 824       range_check(masm, rax, r11,
 825                   StubRoutines::final_stubs_code()->code_begin(),
 826                   StubRoutines::final_stubs_code()->code_end(),
 827                   L_ok);
 828     }
 829     const char* msg = "i2c adapter must return to an interpreter frame";
 830     __ block_comment(msg);
 831     __ stop(msg);
 832     __ bind(L_ok);
 833     __ block_comment("} verify_i2c ");
 834   }
 835 
 836   // Must preserve original SP for loading incoming arguments because
 837   // we need to align the outgoing SP for compiled code.
 838   __ movptr(r11, rsp);
 839 
 840   // Pick up the return address
 841   __ pop(rax);
 842 
 843   // Convert 4-byte c2 stack slots to words.
 844   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
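       // e.g. 5 compiled arg slots -> 20 bytes -> rounded up to 24 bytes -> 3 words of
       // outgoing space to carve out below.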
 845 
 846   if (comp_args_on_stack) {
 847     __ subptr(rsp, comp_words_on_stack * wordSize);
 848   }
 849 
 850   // Ensure compiled code always sees stack at proper alignment
 851   __ andptr(rsp, -16);
 852 
 853   // Push the return address; this misaligns the stack exactly as a call instruction
 854   // would, which is what the youngest compiled frame always expects to see.
 855   __ push(rax);
 856 
 857   // Put saved SP in another register
 858   const Register saved_sp = rax;
 859   __ movptr(saved_sp, r11);
 860 
 861   // Will jump to the compiled code just as if compiled code was doing it.
 862   // Pre-load the register-jump target early, to schedule it better.
 863   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 864 
 865 #if INCLUDE_JVMCI
 866   if (EnableJVMCI) {
 867     // check if this call should be routed towards a specific entry point
 868     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 869     Label no_alternative_target;
 870     __ jcc(Assembler::equal, no_alternative_target);
 871     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 872     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 873     __ bind(no_alternative_target);
 874   }
 875 #endif // INCLUDE_JVMCI
 876 
 877   // Now generate the shuffle code.  Pick up all register args and move the
 878   // rest through the floating point stack top.
 879   for (int i = 0; i < total_args_passed; i++) {
 880     if (sig_bt[i] == T_VOID) {
 881       // Longs and doubles are passed in native word order, but misaligned
 882       // in the 32-bit build.
 883       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 884       continue;
 885     }
 886 
 887     // Pick up 0, 1 or 2 words from SP+offset.
 888 
 889     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 890             "scrambled load targets?");
 891     // Load in argument order going down.
 892     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 893     // Point to interpreter value (vs. tag)
 894     int next_off = ld_off - Interpreter::stackElementSize;
 895     //
 896     //
 897     //
 898     VMReg r_1 = regs[i].first();
 899     VMReg r_2 = regs[i].second();
 900     if (!r_1->is_valid()) {
 901       assert(!r_2->is_valid(), "");
 902       continue;
 903     }
 904     if (r_1->is_stack()) {
 905       // Convert stack slot to an SP offset (+ wordSize to account for return address)
 906       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 907 
 908       // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
 909       // and if we end up going through a c2i because of a miss a reasonable value of r13
 910       // will be generated.
 911       if (!r_2->is_valid()) {
 912         // sign extend???
 913         __ movl(r13, Address(saved_sp, ld_off));
 914         __ movptr(Address(rsp, st_off), r13);
 915       } else {
 916         //
 917         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
 918         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
 919         // So we must adjust where to pick up the data to match the interpreter.
 920         //
 921         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
 922         // are accessed with negative offsets so the LSW is at the lower address
 923 
 924         // ld_off is MSW so get LSW
 925         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 926                            next_off : ld_off;
 927         __ movq(r13, Address(saved_sp, offset));
 928         // st_off is LSW (i.e. reg.first())
 929         __ movq(Address(rsp, st_off), r13);
 930       }
 931     } else if (r_1->is_Register()) {  // Register argument
 932       Register r = r_1->as_Register();
 933       assert(r != rax, "must be different");
 934       if (r_2->is_valid()) {
 935         //
 936         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
 937         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
 938         // So we must adjust where to pick up the data to match the interpreter.
 939 
 940         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 941                            next_off : ld_off;
 942 
 943         // this can be a misaligned move
 944         __ movq(r, Address(saved_sp, offset));
 945       } else {
 946         // sign extend and use a full word?
 947         __ movl(r, Address(saved_sp, ld_off));
 948       }
 949     } else {
 950       if (!r_2->is_valid()) {
 951         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 952       } else {
 953         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 954       }
 955     }
 956   }
 957 
 958   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 959 
 960   // 6243940 We might end up in handle_wrong_method if
 961   // the callee is deoptimized as we race thru here. If that
 962   // happens we don't want to take a safepoint because the
 963   // caller frame will look interpreted and arguments are now
 964   // "compiled" so it is much better to make this transition
 965   // invisible to the stack walking code. Unfortunately if
 966   // we try and find the callee by normal means a safepoint
 967   // is possible. So we stash the desired callee in the thread
 968   // and the VM will find it there should this case occur.
 969 
 970   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 971 
 972   // put Method* where a c2i would expect it should we end up there;
 973   // only needed because the c2 resolve stubs return the Method* as a result in
 974   // rax
 975   __ mov(rax, rbx);
 976   __ jmp(r11);
 977 }
 978 
 979 // ---------------------------------------------------------------
 980 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 981                                                             int total_args_passed,
 982                                                             int comp_args_on_stack,
 983                                                             const BasicType *sig_bt,
 984                                                             const VMRegPair *regs,
 985                                                             AdapterFingerPrint* fingerprint) {
 986   address i2c_entry = __ pc();
 987 
 988   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 989 
 990   // -------------------------------------------------------------------------
 991   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 992   // to the interpreter.  The args start out packed in the compiled layout.  They
 993   // need to be unpacked into the interpreter layout.  This will almost always
 994   // require some stack space.  We grow the current (compiled) stack, then repack
 995   // the args.  We  finally end in a jump to the generic interpreter entry point.
 996   // On exit from the interpreter, the interpreter will restore our SP (lest the
 997   // compiled code, which relies solely on SP and not RBP, get sick).
 998 
 999   address c2i_unverified_entry = __ pc();
1000   Label skip_fixup;
1001 
1002   Register data = rax;
1003   Register receiver = j_rarg0;
1004   Register temp = rbx;
1005 
1006   {
1007     __ ic_check(1 /* end_alignment */);
1008     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1009     // Method might have been compiled since the call site was patched to
1010     // interpreted; if that is the case, treat it as a miss so we can get
1011     // the call site corrected.
1012     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1013     __ jcc(Assembler::equal, skip_fixup);
1014     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1015   }
1016 
1017   address c2i_entry = __ pc();
1018 
1019   // Class initialization barrier for static methods
1020   address c2i_no_clinit_check_entry = nullptr;
1021   if (VM_Version::supports_fast_class_init_checks()) {
1022     Label L_skip_barrier;
1023     Register method = rbx;
1024 
1025     { // Bypass the barrier for non-static methods
1026       Register flags = rscratch1;
1027       __ movl(flags, Address(method, Method::access_flags_offset()));
1028       __ testl(flags, JVM_ACC_STATIC);
1029       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1030     }
1031 
1032     Register klass = rscratch1;
1033     __ load_method_holder(klass, method);
1034     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1035 
1036     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1037 
1038     __ bind(L_skip_barrier);
1039     c2i_no_clinit_check_entry = __ pc();
1040   }
1041 
1042   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1043   bs->c2i_entry_barrier(masm);
1044 
1045   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1046 
1047   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1048 }
1049 
1050 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1051                                          VMRegPair *regs,
1052                                          int total_args_passed) {
1053 
1054 // We return the amount of VMRegImpl stack slots we need to reserve for all
1055 // the arguments NOT counting out_preserve_stack_slots.
1056 
1057 // NOTE: These arrays will have to change when c1 is ported
1058 #ifdef _WIN64
1059     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1060       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1061     };
1062     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1063       c_farg0, c_farg1, c_farg2, c_farg3
1064     };
1065 #else
1066     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1067       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1068     };
1069     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1070       c_farg0, c_farg1, c_farg2, c_farg3,
1071       c_farg4, c_farg5, c_farg6, c_farg7
1072     };
1073 #endif // _WIN64
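         // Note: on Windows the argument registers are positional (the N-th argument uses
         // the N-th register whether it is integer or floating point), which is why the
         // code below bumps fp_args when an integer register is consumed and vice versa.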
1074 
1075 
1076     uint int_args = 0;
1077     uint fp_args = 0;
1078     uint stk_args = 0; // inc by 2 each time
1079 
1080     for (int i = 0; i < total_args_passed; i++) {
1081       switch (sig_bt[i]) {
1082       case T_BOOLEAN:
1083       case T_CHAR:
1084       case T_BYTE:
1085       case T_SHORT:
1086       case T_INT:
1087         if (int_args < Argument::n_int_register_parameters_c) {
1088           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1089 #ifdef _WIN64
1090           fp_args++;
1091           // Allocate slots for the callee to stuff register args on the stack.
1092           stk_args += 2;
1093 #endif
1094         } else {
1095           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1096           stk_args += 2;
1097         }
1098         break;
1099       case T_LONG:
1100         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1101         // fall through
1102       case T_OBJECT:
1103       case T_ARRAY:
1104       case T_ADDRESS:
1105       case T_METADATA:
1106         if (int_args < Argument::n_int_register_parameters_c) {
1107           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1108 #ifdef _WIN64
1109           fp_args++;
1110           stk_args += 2;
1111 #endif
1112         } else {
1113           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1114           stk_args += 2;
1115         }
1116         break;
1117       case T_FLOAT:
1118         if (fp_args < Argument::n_float_register_parameters_c) {
1119           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1120 #ifdef _WIN64
1121           int_args++;
1122           // Allocate slots for the callee to stuff register args on the stack.
1123           stk_args += 2;
1124 #endif
1125         } else {
1126           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1127           stk_args += 2;
1128         }
1129         break;
1130       case T_DOUBLE:
1131         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1132         if (fp_args < Argument::n_float_register_parameters_c) {
1133           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1134 #ifdef _WIN64
1135           int_args++;
1136           // Allocate slots for the callee to stuff register args on the stack.
1137           stk_args += 2;
1138 #endif
1139         } else {
1140           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1141           stk_args += 2;
1142         }
1143         break;
1144       case T_VOID: // Halves of longs and doubles
1145         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1146         regs[i].set_bad();
1147         break;
1148       default:
1149         ShouldNotReachHere();
1150         break;
1151       }
1152     }
1153 #ifdef _WIN64
1154   // The Windows ABI requires that we always allocate enough stack space
1155   // for 4 64-bit registers to be stored down.
1156   if (stk_args < 8) {
1157     stk_args = 8;
1158   }
1159 #endif // _WIN64
1160 
1161   return stk_args;
1162 }
1163 
1164 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1165                                              uint num_bits,
1166                                              uint total_args_passed) {
1167   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1168          "only certain vector sizes are supported for now");
1169 
1170   static const XMMRegister VEC_ArgReg[32] = {
1171      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1172      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1173     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1174     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1175   };
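       // Each vector argument is passed entirely in a register: the VMRegPair spans
       // num_bits/32 32-bit VMReg slots, so next_val below is (num_bits/32 - 1) and no
       // stack slots are ever consumed (stk_args stays 0).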
1176 
1177   uint stk_args = 0;
1178   uint fp_args = 0;
1179 
1180   for (uint i = 0; i < total_args_passed; i++) {
1181     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1182     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1183     regs[i].set_pair(vmreg->next(next_val), vmreg);
1184   }
1185 
1186   return stk_args;
1187 }
1188 
1189 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1190   // We always ignore the frame_slots arg and just use the space just below the
1191   // frame pointer, which by this time is free to use
1192   switch (ret_type) {
1193   case T_FLOAT:
1194     __ movflt(Address(rbp, -wordSize), xmm0);
1195     break;
1196   case T_DOUBLE:
1197     __ movdbl(Address(rbp, -wordSize), xmm0);
1198     break;
1199   case T_VOID:  break;
1200   default: {
1201     __ movptr(Address(rbp, -wordSize), rax);
1202     }
1203   }
1204 }
1205 
1206 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1207   // We always ignore the frame_slots arg and just use the space just below the
1208   // frame pointer, which by this time is free to use
1209   switch (ret_type) {
1210   case T_FLOAT:
1211     __ movflt(xmm0, Address(rbp, -wordSize));
1212     break;
1213   case T_DOUBLE:
1214     __ movdbl(xmm0, Address(rbp, -wordSize));
1215     break;
1216   case T_VOID:  break;
1217   default: {
1218     __ movptr(rax, Address(rbp, -wordSize));
1219     }
1220   }
1221 }
1222 
1223 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1224     for ( int i = first_arg ; i < arg_count ; i++ ) {
1225       if (args[i].first()->is_Register()) {
1226         __ push(args[i].first()->as_Register());
1227       } else if (args[i].first()->is_XMMRegister()) {
1228         __ subptr(rsp, 2*wordSize);
1229         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1230       }
1231     }
1232 }
1233 
1234 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1235     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1236       if (args[i].first()->is_Register()) {
1237         __ pop(args[i].first()->as_Register());
1238       } else if (args[i].first()->is_XMMRegister()) {
1239         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1240         __ addptr(rsp, 2*wordSize);
1241       }
1242     }
1243 }
1244 
1245 static void verify_oop_args(MacroAssembler* masm,
1246                             const methodHandle& method,
1247                             const BasicType* sig_bt,
1248                             const VMRegPair* regs) {
1249   Register temp_reg = rbx;  // not part of any compiled calling seq
1250   if (VerifyOops) {
1251     for (int i = 0; i < method->size_of_parameters(); i++) {
1252       if (is_reference_type(sig_bt[i])) {
1253         VMReg r = regs[i].first();
1254         assert(r->is_valid(), "bad oop arg");
1255         if (r->is_stack()) {
1256           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1257           __ verify_oop(temp_reg);
1258         } else {
1259           __ verify_oop(r->as_Register());
1260         }
1261       }
1262     }
1263   }
1264 }
1265 
1266 static void check_continuation_enter_argument(VMReg actual_vmreg,
1267                                               Register expected_reg,
1268                                               const char* name) {
1269   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1270   assert(actual_vmreg->as_Register() == expected_reg,
1271          "%s is in unexpected register: %s instead of %s",
1272          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1273 }
1274 
1275 
1276 //---------------------------- continuation_enter_setup ---------------------------
1277 //
1278 // Arguments:
1279 //   None.
1280 //
1281 // Results:
1282 //   rsp: pointer to blank ContinuationEntry
1283 //
1284 // Kills:
1285 //   rax
1286 //
1287 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1288   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1289   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1290   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1291 
1292   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1293   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1294 
1295   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1296   OopMap* map = new OopMap(frame_size, 0);
1297 
1298   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1299   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1300   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1301 
1302   return map;
1303 }
1304 
1305 //---------------------------- fill_continuation_entry ---------------------------
1306 //
1307 // Arguments:
1308 //   rsp: pointer to blank ContinuationEntry
1309 //   reg_cont_obj: pointer to the continuation
1310 //   reg_flags: flags
1311 //
1312 // Results:
1313 //   rsp: pointer to filled out ContinuationEntry
1314 //
1315 // Kills:
1316 //   rax
1317 //
1318 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1319   assert_different_registers(rax, reg_cont_obj, reg_flags);
1320 #ifdef ASSERT
1321   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1322 #endif
1323   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1324   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1325   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1326   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1327   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1328 
1329   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1330   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1331   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1332   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1333 
1334   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1335   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1336 }
1337 
1338 //---------------------------- continuation_enter_cleanup ---------------------------
1339 //
1340 // Arguments:
1341 //   rsp: pointer to the ContinuationEntry
1342 //
1343 // Results:
1344 //   rsp: pointer to the spilled rbp in the entry frame
1345 //
1346 // Kills:
1347 //   rbx
1348 //
1349 static void continuation_enter_cleanup(MacroAssembler* masm) {
1350 #ifdef ASSERT
1351   Label L_good_sp;
1352   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1353   __ jcc(Assembler::equal, L_good_sp);
1354   __ stop("Incorrect rsp at continuation_enter_cleanup");
1355   __ bind(L_good_sp);
1356 #endif
1357 
1358   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1359   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1360   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1361   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1362 
1363   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1364   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1365   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1366 }
1367 
1368 static void gen_continuation_enter(MacroAssembler* masm,
1369                                    const VMRegPair* regs,
1370                                    int& exception_offset,
1371                                    OopMapSet* oop_maps,
1372                                    int& frame_complete,
1373                                    int& stack_slots,
1374                                    int& interpreted_entry_offset,
1375                                    int& compiled_entry_offset) {
1376 
1377   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1378   int pos_cont_obj   = 0;
1379   int pos_is_cont    = 1;
1380   int pos_is_virtual = 2;
1381 
1382   // The platform-specific calling convention may present the arguments in various registers.
1383   // To simplify the rest of the code, we expect the arguments to reside in these known
1384   // registers, and we additionally check the placement here in case the calling convention
1385   // ever changes.
1386   Register reg_cont_obj   = c_rarg1;
1387   Register reg_is_cont    = c_rarg2;
1388   Register reg_is_virtual = c_rarg3;
1389 
1390   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1391   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1392   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1393 
1394   // Utility methods kill rax, make sure there are no collisions
1395   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1396 
1397   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1398                          relocInfo::static_call_type);
1399 
1400   address start = __ pc();
1401 
1402   Label L_thaw, L_exit;
1403 
1404   // i2i entry, used only in interp_only_mode
1405   interpreted_entry_offset = __ pc() - start;
1406   {
1407 #ifdef ASSERT
1408     Label is_interp_only;
1409     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1410     __ jcc(Assembler::notEqual, is_interp_only);
1411     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1412     __ bind(is_interp_only);
1413 #endif
1414 
1415     __ pop(rax); // return address
1416     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1417     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1418     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1419     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
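         // (The interpreter leaves the last argument closest to rsp, hence the reversed
         //  stack-element offsets above.)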
1420     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1421     __ push(rax); // return address
1422     __ push_cont_fastpath();
1423 
1424     __ enter();
1425 
1426     stack_slots = 2; // will be adjusted in setup
1427     OopMap* map = continuation_enter_setup(masm, stack_slots);
1428     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1429     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1430 
1431     __ verify_oop(reg_cont_obj);
1432 
1433     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1434 
1435     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1436     __ testptr(reg_is_cont, reg_is_cont);
1437     __ jcc(Assembler::notZero, L_thaw);
1438 
1439     // --- Resolve path
1440 
1441     // Make sure the call is patchable
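         // (presumably the align keeps the call's 4-byte displacement from spanning an
         //  8-byte boundary, so the call site can be patched atomically)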
1442     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1443     // Emit stub for static call
1444     CodeBuffer* cbuf = masm->code_section()->outer();
1445     address stub = CompiledDirectCall::emit_to_interp_stub(*cbuf, __ pc());
1446     if (stub == nullptr) {
1447       fatal("CodeCache is full at gen_continuation_enter");
1448     }
1449     __ call(resolve);
1450     oop_maps->add_gc_map(__ pc() - start, map);
1451     __ post_call_nop();
1452 
1453     __ jmp(L_exit);
1454   }
1455 
1456   // compiled entry
1457   __ align(CodeEntryAlignment);
1458   compiled_entry_offset = __ pc() - start;
1459   __ enter();
1460 
1461   stack_slots = 2; // will be adjusted in setup
1462   OopMap* map = continuation_enter_setup(masm, stack_slots);
1463 
1464   // Frame is now completed as far as size and linkage.
1465   frame_complete = __ pc() - start;
1466 
1467   __ verify_oop(reg_cont_obj);
1468 
1469   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1470 
1471   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1472   __ testptr(reg_is_cont, reg_is_cont);
1473   __ jccb(Assembler::notZero, L_thaw);
1474 
1475   // --- call Continuation.enter(Continuation c, boolean isContinue)
1476 
1477   // Make sure the call is patchable
1478   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1479 
1480   // Emit stub for static call
1481   CodeBuffer* cbuf = masm->code_section()->outer();
1482   address stub = CompiledDirectCall::emit_to_interp_stub(*cbuf, __ pc());
1483   if (stub == nullptr) {
1484     fatal("CodeCache is full at gen_continuation_enter");
1485   }
1486 
1487   // The call needs to be resolved. There's a special case for this in
1488   // SharedRuntime::find_callee_info_helper() which calls
1489   // LinkResolver::resolve_continuation_enter() which resolves the call to
1490   // Continuation.enter(Continuation c, boolean isContinue).
1491   __ call(resolve);
1492 
1493   oop_maps->add_gc_map(__ pc() - start, map);
1494   __ post_call_nop();
1495 
1496   __ jmpb(L_exit);
1497 
1498   // --- Thawing path
1499 
1500   __ bind(L_thaw);
1501 
1502   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1503 
1504   ContinuationEntry::_return_pc_offset = __ pc() - start;
1505   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1506   __ post_call_nop();
1507 
1508   // --- Normal exit (resolve/thawing)
1509 
1510   __ bind(L_exit);
1511 
1512   continuation_enter_cleanup(masm);
1513   __ pop(rbp);
1514   __ ret(0);
1515 
1516   // --- Exception handling path
1517 
1518   exception_offset = __ pc() - start;
1519 
1520   continuation_enter_cleanup(masm);
1521   __ pop(rbp);
1522 
1523   __ movptr(c_rarg0, r15_thread);
1524   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1525 
1526   // rax still holds the original exception oop, save it before the call
1527   __ push(rax);
1528 
1529   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1530   __ movptr(rbx, rax);
1531 
1532   // Continue at exception handler:
1533   //   rax: exception oop
1534   //   rbx: exception handler
1535   //   rdx: exception pc
1536   __ pop(rax);
1537   __ verify_oop(rax);
1538   __ pop(rdx);
1539   __ jmp(rbx);
1540 }
1541 
1542 static void gen_continuation_yield(MacroAssembler* masm,
1543                                    const VMRegPair* regs,
1544                                    OopMapSet* oop_maps,
1545                                    int& frame_complete,
1546                                    int& stack_slots,
1547                                    int& compiled_entry_offset) {
1548   enum layout {
1549     rbp_off,
1550     rbpH_off,
1551     return_off,
1552     return_off2,
1553     framesize // inclusive of return address
1554   };
1555   stack_slots = framesize / VMRegImpl::slots_per_word;
1556   assert(stack_slots == 2, "recheck layout");
1557 
1558   address start = __ pc();
1559   compiled_entry_offset = __ pc() - start;
1560   __ enter();
1561   address the_pc = __ pc();
1562 
1563   frame_complete = the_pc - start;
1564 
1565   // This nop must be exactly at the PC we push into the frame info.
1566   // We use this nop for fast CodeBlob lookup, so we associate the OopMap
1567   // with it right away.
1568   __ post_call_nop();
1569   OopMap* map = new OopMap(framesize, 1);
1570   oop_maps->add_gc_map(frame_complete, map);
1571 
1572   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1573   __ movptr(c_rarg0, r15_thread);
1574   __ movptr(c_rarg1, rsp);
1575   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1576   __ reset_last_Java_frame(true);
1577 
1578   Label L_pinned;
1579 
1580   __ testptr(rax, rax);
1581   __ jcc(Assembler::notZero, L_pinned);
1582 
1583   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1584   continuation_enter_cleanup(masm);
1585   __ pop(rbp);
1586   __ ret(0);
1587 
1588   __ bind(L_pinned);
1589 
1590   // Pinned, return to caller
1591 
1592   // handle pending exception thrown by freeze
1593   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1594   Label ok;
1595   __ jcc(Assembler::equal, ok);
1596   __ leave();
1597   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1598   __ bind(ok);
1599 
1600   __ leave();
1601   __ ret(0);
1602 }
1603 
1604 static void gen_special_dispatch(MacroAssembler* masm,
1605                                  const methodHandle& method,
1606                                  const BasicType* sig_bt,
1607                                  const VMRegPair* regs) {
1608   verify_oop_args(masm, method, sig_bt, regs);
1609   vmIntrinsics::ID iid = method->intrinsic_id();
1610 
1611   // Now write the args into the outgoing interpreter space
1612   bool     has_receiver   = false;
1613   Register receiver_reg   = noreg;
1614   int      member_arg_pos = -1;
1615   Register member_reg     = noreg;
1616   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1617   if (ref_kind != 0) {
1618     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1619     member_reg = rbx;  // known to be free at this point
1620     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1621   } else if (iid == vmIntrinsics::_invokeBasic) {
1622     has_receiver = true;
1623   } else if (iid == vmIntrinsics::_linkToNative) {
1624     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1625     member_reg = rbx;  // known to be free at this point
1626   } else {
1627     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1628   }
1629 
1630   if (member_reg != noreg) {
1631     // Load the member_arg into register, if necessary.
1632     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1633     VMReg r = regs[member_arg_pos].first();
1634     if (r->is_stack()) {
1635       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1636     } else {
1637       // no data motion is needed
1638       member_reg = r->as_Register();
1639     }
1640   }
1641 
1642   if (has_receiver) {
1643     // Make sure the receiver is loaded into a register.
1644     assert(method->size_of_parameters() > 0, "oob");
1645     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1646     VMReg r = regs[0].first();
1647     assert(r->is_valid(), "bad receiver arg");
1648     if (r->is_stack()) {
1649       // Porting note:  This assumes that compiled calling conventions always
1650       // pass the receiver oop in a register.  If this is not true on some
1651       // platform, pick a temp and load the receiver from stack.
1652       fatal("receiver always in a register");
1653       receiver_reg = j_rarg0;  // known to be free at this point
1654       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1655     } else {
1656       // no data motion is needed
1657       receiver_reg = r->as_Register();
1658     }
1659   }
1660 
1661   // Figure out which address we are really jumping to:
1662   MethodHandles::generate_method_handle_dispatch(masm, iid,
1663                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1664 }
1665 
1666 // ---------------------------------------------------------------------------
1667 // Generate a native wrapper for a given method.  The method takes arguments
1668 // in the Java compiled code convention, marshals them to the native
1669 // convention (handlizes oops, etc), transitions to native, makes the call,
1670 // returns to java state (possibly blocking), unhandlizes any result and
1671 // returns.
1672 //
1673 // Critical native functions are a shorthand for the use of
1674 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1675 // functions.  The wrapper is expected to unpack the arguments before
1676 // passing them to the callee. Critical native functions leave the state _in_Java,
1677 // since they cannot stop for GC.
1678 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1679 // block and the check for pending exceptions, since it's impossible for them
1680 // to be thrown.
1681 //
1682 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1683                                                 const methodHandle& method,
1684                                                 int compile_id,
1685                                                 BasicType* in_sig_bt,
1686                                                 VMRegPair* in_regs,
1687                                                 BasicType ret_type) {
1688   if (method->is_continuation_native_intrinsic()) {
1689     int exception_offset = -1;
1690     OopMapSet* oop_maps = new OopMapSet();
1691     int frame_complete = -1;
1692     int stack_slots = -1;
1693     int interpreted_entry_offset = -1;
1694     int vep_offset = -1;
1695     if (method->is_continuation_enter_intrinsic()) {
1696       gen_continuation_enter(masm,
1697                              in_regs,
1698                              exception_offset,
1699                              oop_maps,
1700                              frame_complete,
1701                              stack_slots,
1702                              interpreted_entry_offset,
1703                              vep_offset);
1704     } else if (method->is_continuation_yield_intrinsic()) {
1705       gen_continuation_yield(masm,
1706                              in_regs,
1707                              oop_maps,
1708                              frame_complete,
1709                              stack_slots,
1710                              vep_offset);
1711     } else {
1712       guarantee(false, "Unknown Continuation native intrinsic");
1713     }
1714 
1715 #ifdef ASSERT
1716     if (method->is_continuation_enter_intrinsic()) {
1717       assert(interpreted_entry_offset != -1, "Must be set");
1718       assert(exception_offset != -1,         "Must be set");
1719     } else {
1720       assert(interpreted_entry_offset == -1, "Must be unset");
1721       assert(exception_offset == -1,         "Must be unset");
1722     }
1723     assert(frame_complete != -1,    "Must be set");
1724     assert(stack_slots != -1,       "Must be set");
1725     assert(vep_offset != -1,        "Must be set");
1726 #endif
1727 
1728     __ flush();
1729     nmethod* nm = nmethod::new_native_nmethod(method,
1730                                               compile_id,
1731                                               masm->code(),
1732                                               vep_offset,
1733                                               frame_complete,
1734                                               stack_slots,
1735                                               in_ByteSize(-1),
1736                                               in_ByteSize(-1),
1737                                               oop_maps,
1738                                               exception_offset);
1739     if (nm == nullptr) return nm;
1740     if (method->is_continuation_enter_intrinsic()) {
1741       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1742     } else if (method->is_continuation_yield_intrinsic()) {
1743       _cont_doYield_stub = nm;
1744     }
1745     return nm;
1746   }
1747 
1748   if (method->is_method_handle_intrinsic()) {
1749     vmIntrinsics::ID iid = method->intrinsic_id();
1750     intptr_t start = (intptr_t)__ pc();
1751     int vep_offset = ((intptr_t)__ pc()) - start;
1752     gen_special_dispatch(masm,
1753                          method,
1754                          in_sig_bt,
1755                          in_regs);
1756     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1757     __ flush();
1758     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1759     return nmethod::new_native_nmethod(method,
1760                                        compile_id,
1761                                        masm->code(),
1762                                        vep_offset,
1763                                        frame_complete,
1764                                        stack_slots / VMRegImpl::slots_per_word,
1765                                        in_ByteSize(-1),
1766                                        in_ByteSize(-1),
1767                                        nullptr);
1768   }
1769   address native_func = method->native_function();
1770   assert(native_func != nullptr, "must have function");
1771 
1772   // An OopMap for lock (and class if static)
1773   OopMapSet *oop_maps = new OopMapSet();
1774   intptr_t start = (intptr_t)__ pc();
1775 
1776   // We have received a description of where all the java args are located
1777   // on entry to the wrapper. We need to convert these args to where
1778   // the jni function will expect them. To figure out where they go
1779   // we convert the java signature to a C signature by inserting
1780   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1781 
1782   const int total_in_args = method->size_of_parameters();
1783   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
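       // For example (a hypothetical static native method taking a single int):
       // total_in_args == 1 and total_c_args == 3 (JNIEnv*, the class mirror handle, and the int).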
1784 
1785   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1786   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1787   BasicType* in_elem_bt = nullptr;
1788 
1789   int argc = 0;
1790   out_sig_bt[argc++] = T_ADDRESS;
1791   if (method->is_static()) {
1792     out_sig_bt[argc++] = T_OBJECT;
1793   }
1794 
1795   for (int i = 0; i < total_in_args ; i++ ) {
1796     out_sig_bt[argc++] = in_sig_bt[i];
1797   }
1798 
1799   // Now figure out where the args must be stored and how much stack space
1800   // they require.
1801   int out_arg_slots;
1802   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1803 
1804   // Compute framesize for the wrapper.  We need to handlize all oops in
1805   // incoming registers
1806 
1807   // Calculate the total number of stack slots we will need.
1808 
1809   // First count the abi requirement plus all of the outgoing args
1810   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1811 
1812   // Now the space for the inbound oop handle area
1813   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1814 
1815   int oop_handle_offset = stack_slots;
1816   stack_slots += total_save_slots;
1817 
1818   // Now any space we need for handlizing a klass if static method
1819 
1820   int klass_slot_offset = 0;
1821   int klass_offset = -1;
1822   int lock_slot_offset = 0;
1823   bool is_static = false;
1824 
1825   if (method->is_static()) {
1826     klass_slot_offset = stack_slots;
1827     stack_slots += VMRegImpl::slots_per_word;
1828     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1829     is_static = true;
1830   }
1831 
1832   // Plus a lock if needed
1833 
1834   if (method->is_synchronized()) {
1835     lock_slot_offset = stack_slots;
1836     stack_slots += VMRegImpl::slots_per_word;
1837   }
1838 
1839   // Now a place (+2) to save return values or temp during shuffling
1840   // + 4 for return address (which we own) and saved rbp
1841   stack_slots += 6;
1842 
1843   // OK. The space we have allocated will look like:
1844   //
1845   //
1846   // FP-> |                     |
1847   //      |---------------------|
1848   //      | 2 slots for moves   |
1849   //      |---------------------|
1850   //      | lock box (if sync)  |
1851   //      |---------------------| <- lock_slot_offset
1852   //      | klass (if static)   |
1853   //      |---------------------| <- klass_slot_offset
1854   //      | oopHandle area      |
1855   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1856   //      | outbound memory     |
1857   //      | based arguments     |
1858   //      |                     |
1859   //      |---------------------|
1860   //      |                     |
1861   // SP-> | out_preserved_slots |
1862   //
1863   //
1864 
1865 
1866   // Now compute the actual number of stack words we need, rounding to keep
1867   // the stack properly aligned.
1868   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1869 
1870   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
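       // Worked example, assuming a Linux x86_64 build where every C argument fits in a
       // register (so out_arg_slots == 0): a non-static, non-synchronized method needs
       // 0 + 12 (oop handle area) + 6 = 18 slots, aligned up to 20 slots, i.e. stack_size == 80 bytes.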
1871 
1872   // First thing: make an ic check to see if we should even be here
1873 
1874   // We are free to use all registers as temps without saving them and
1875   // restoring them except rbp. rbp is the only callee save register
1876   // as far as the interpreter and the compiler(s) are concerned.
1877 
1878   const Register receiver = j_rarg0;
1879 
1880   Label exception_pending;
1881 
1882   assert_different_registers(receiver, rscratch1, rscratch2);
1883   __ verify_oop(receiver);
1884   __ ic_check(8 /* end_alignment */);
1885 
1886   int vep_offset = ((intptr_t)__ pc()) - start;
1887 
1888   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1889     Label L_skip_barrier;
1890     Register klass = r10;
1891     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1892     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1893 
1894     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1895 
1896     __ bind(L_skip_barrier);
1897   }
1898 
1899 #ifdef COMPILER1
1900   // For Object.hashCode and System.identityHashCode, try to pull the hashCode from the object header if available.
1901   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1902     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1903   }
1904 #endif // COMPILER1
1905 
1906   // The instruction at the verified entry point must be 5 bytes or longer
1907   // because it can be patched on the fly by make_non_entrant. The stack bang
1908   // instruction fits that requirement.
1909 
1910   // Generate stack overflow check
1911   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1912 
1913   // Generate a new frame for the wrapper.
1914   __ enter();
1915   // -2 because return address is already present and so is saved rbp
1916   __ subptr(rsp, stack_size - 2*wordSize);
1917 
1918   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1919   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1920   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1921 
1922   // Frame is now completed as far as size and linkage.
1923   int frame_complete = ((intptr_t)__ pc()) - start;
1924 
1925   if (UseRTMLocking) {
1926     // Abort RTM transaction before calling JNI
1927     // because critical section will be large and will be
1928     // aborted anyway. Also nmethod could be deoptimized.
1929     __ xabort(0);
1930   }
1931 
1932 #ifdef ASSERT
1933   __ check_stack_alignment(rsp, "improperly aligned stack");
1934 #endif /* ASSERT */
1935 
1936 
1937   // We use r14 as the oop handle for the receiver/klass
1938   // It is callee save so it survives the call to native
1939 
1940   const Register oop_handle_reg = r14;
1941 
1942   //
1943   // We immediately shuffle the arguments so that for any vm call we have to
1944   // make from here on out (sync slow path, jvmti, etc.) we will already have
1945   // captured the oops from our caller and have a valid oopMap for
1946   // them.
1947 
1948   // -----------------
1949   // The Grand Shuffle
1950 
1951   // The Java calling convention is either equal (linux) or denser (win64) than the
1952   // c calling convention. However, because of the jni_env argument the c calling
1953   // convention always has at least one more (and two for static) arguments than Java.
1954   // Therefore if we move the args from java -> c backwards then we will never have
1955   // a register->register conflict and we don't have to build a dependency graph
1956   // and figure out how to break any cycles.
1957   //
1958 
1959   // Record esp-based slot for receiver on stack for non-static methods
1960   int receiver_offset = -1;
1961 
1962   // This is a trick. We double the stack slots so we can claim
1963   // the oops in the caller's frame. Since we are sure to have
1964   // more args than the caller, doubling is enough to make
1965   // sure we can capture all the incoming oop args from the
1966   // caller.
1967   //
1968   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1969 
1970   // Mark location of rbp (someday)
1971   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1972 
1973   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1974   // All inbound args are referenced based on rbp and all outbound args via rsp.
1975 
1976 
1977 #ifdef ASSERT
1978   bool reg_destroyed[Register::number_of_registers];
1979   bool freg_destroyed[XMMRegister::number_of_registers];
1980   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
1981     reg_destroyed[r] = false;
1982   }
1983   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
1984     freg_destroyed[f] = false;
1985   }
1986 
1987 #endif /* ASSERT */
1988 
1989   // For JNI natives the incoming and outgoing registers are offset upwards.
1990   GrowableArray<int> arg_order(2 * total_in_args);
1991 
1992   VMRegPair tmp_vmreg;
1993   tmp_vmreg.set2(rbx->as_VMReg());
1994 
1995   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1996     arg_order.push(i);
1997     arg_order.push(c_arg);
1998   }
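       // arg_order now holds (java_index, c_index) pairs with the last argument first,
       // so the moves below run backwards as described above.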
1999 
2000   int temploc = -1;
2001   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2002     int i = arg_order.at(ai);
2003     int c_arg = arg_order.at(ai + 1);
2004     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2005 #ifdef ASSERT
2006     if (in_regs[i].first()->is_Register()) {
2007       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2008     } else if (in_regs[i].first()->is_XMMRegister()) {
2009       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2010     }
2011     if (out_regs[c_arg].first()->is_Register()) {
2012       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2013     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2014       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2015     }
2016 #endif /* ASSERT */
2017     switch (in_sig_bt[i]) {
2018       case T_ARRAY:
2019       case T_OBJECT:
2020         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2021                     ((i == 0) && (!is_static)),
2022                     &receiver_offset);
2023         break;
2024       case T_VOID:
2025         break;
2026 
2027       case T_FLOAT:
2028         __ float_move(in_regs[i], out_regs[c_arg]);
2029         break;
2030 
2031       case T_DOUBLE:
2032         assert( i + 1 < total_in_args &&
2033                 in_sig_bt[i + 1] == T_VOID &&
2034                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2035         __ double_move(in_regs[i], out_regs[c_arg]);
2036         break;
2037 
2038       case T_LONG :
2039         __ long_move(in_regs[i], out_regs[c_arg]);
2040         break;
2041 
2042       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2043 
2044       default:
2045         __ move32_64(in_regs[i], out_regs[c_arg]);
2046     }
2047   }
2048 
2049   int c_arg;
2050 
2051   // Pre-load a static method's oop into r14.  Used both by locking code and
2052   // the normal JNI call code.
2053   // point c_arg at the first arg that is already loaded in case we
2054   // need to spill before we call out
2055   c_arg = total_c_args - total_in_args;
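       // (c_arg is 1 for instance methods, skipping only the JNIEnv* slot, and 2 for
       //  static ones; the static case is decremented again below once the klass handle
       //  is in place.)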
2056 
2057   if (method->is_static()) {
2058 
2059     //  load oop into a register
2060     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2061 
2062     // Now handlize the static class mirror; it's known to be non-null.
2063     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2064     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2065 
2066     // Now get the handle
2067     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2068     // store the klass handle as second argument
2069     __ movptr(c_rarg1, oop_handle_reg);
2070     // and protect the arg if we must spill
2071     c_arg--;
2072   }
2073 
2074   // Change state to native (we save the return address in the thread, since it might not
2075   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2076   // points into the right code segment. It does not have to be the correct return pc.
2077   // We use the same pc/oopMap repeatedly when we call out
2078 
2079   intptr_t the_pc = (intptr_t) __ pc();
2080   oop_maps->add_gc_map(the_pc - start, map);
2081 
2082   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2083 
2084 
2085   // We have all of the arguments set up at this point. We must not touch any of the
2086   // argument registers from here on (what if we save/restore them and there are no oops?).
2087 
2088   {
2089     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2090     // protect the args we've loaded
2091     save_args(masm, total_c_args, c_arg, out_regs);
2092     __ mov_metadata(c_rarg1, method());
2093     __ call_VM_leaf(
2094       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2095       r15_thread, c_rarg1);
2096     restore_args(masm, total_c_args, c_arg, out_regs);
2097   }
2098 
2099   // RedefineClasses() tracing support for obsolete method entry
2100   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2101     // protect the args we've loaded
2102     save_args(masm, total_c_args, c_arg, out_regs);
2103     __ mov_metadata(c_rarg1, method());
2104     __ call_VM_leaf(
2105       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2106       r15_thread, c_rarg1);
2107     restore_args(masm, total_c_args, c_arg, out_regs);
2108   }
2109 
2110   // Lock a synchronized method
2111 
2112   // Register definitions used by locking and unlocking
2113 
2114   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2115   const Register obj_reg  = rbx;  // Will contain the oop
2116   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2117   const Register old_hdr  = r13;  // value of old header at unlock time
2118 
2119   Label slow_path_lock;
2120   Label lock_done;
2121 
2122   if (method->is_synchronized()) {
2123     Label count_mon;
2124 
2125     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2126 
2127     // Get the handle (the 2nd argument)
2128     __ mov(oop_handle_reg, c_rarg1);
2129 
2130     // Get address of the box
2131 
2132     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2133 
2134     // Load the oop from the handle
2135     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2136 
2137     if (LockingMode == LM_MONITOR) {
2138       __ jmp(slow_path_lock);
2139     } else if (LockingMode == LM_LEGACY) {
2140       // Load immediate 1 into swap_reg %rax
2141       __ movl(swap_reg, 1);
2142 
2143       // Load (object->mark() | 1) into swap_reg %rax
2144       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2145 
2146       // Save (object->mark() | 1) into BasicLock's displaced header
2147       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2148 
2149       // src -> dest iff dest == rax else rax <- dest
2150       __ lock();
2151       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2152       __ jcc(Assembler::equal, count_mon);
2153 
2154       // Hmm should this move to the slow path code area???
2155 
2156       // Test if the oopMark is an obvious stack pointer, i.e.,
2157       //  1) (mark & 3) == 0, and
2158       //  2) rsp <= mark < rsp + os::pagesize()
2159       // These 3 tests can be done by evaluating the following
2160       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2161       // assuming both stack pointer and pagesize have their
2162       // least significant 2 bits clear.
2163       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
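           // Worked example, assuming a 4 KiB page: 3 - 4096 sign-extends to 0x...fffff003,
           // so the AND below is zero only when the low two bits of (mark - rsp) are clear
           // and the difference is less than one page, meaning the displaced mark points
           // just above rsp into our own stack (the recursive stack-lock case).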
2164 
2165       __ subptr(swap_reg, rsp);
2166       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2167 
2168       // Save the test result; for the recursive case, the result is zero
2169       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2170       __ jcc(Assembler::notEqual, slow_path_lock);
2171     } else {
2172       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2173       __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2174     }
2175     __ bind(count_mon);
2176     __ inc_held_monitor_count();
2177 
2178     // Slow path will re-enter here
2179     __ bind(lock_done);
2180   }
2181 
2182   // Finally just about ready to make the JNI call
2183 
2184   // get JNIEnv* which is first argument to native
2185   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2186 
2187   // Now set thread in native
2188   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2189 
2190   __ call(RuntimeAddress(native_func));
2191 
2192   // Verify or restore cpu control state after JNI call
2193   __ restore_cpu_control_state_after_jni(rscratch1);
2194 
2195   // Unpack native results.
2196   switch (ret_type) {
2197   case T_BOOLEAN: __ c2bool(rax);            break;
2198   case T_CHAR   : __ movzwl(rax, rax);      break;
2199   case T_BYTE   : __ sign_extend_byte (rax); break;
2200   case T_SHORT  : __ sign_extend_short(rax); break;
2201   case T_INT    : /* nothing to do */        break;
2202   case T_DOUBLE :
2203   case T_FLOAT  :
2204     // Result is in xmm0; we'll save it as needed
2205     break;
2206   case T_ARRAY:                 // Really a handle
2207   case T_OBJECT:                // Really a handle
2208       break; // can't de-handlize until after safepoint check
2209   case T_VOID: break;
2210   case T_LONG: break;
2211   default       : ShouldNotReachHere();
2212   }
2213 
2214   Label after_transition;
2215 
2216   // Switch thread to "native transition" state before reading the synchronization state.
2217   // This additional state is necessary because reading and testing the synchronization
2218   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2219   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2220   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2221   //     Thread A is resumed to finish this native method, but doesn't block here since it
2222   //     didn't see any synchronization in progress, and escapes.
2223   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2224 
2225   // Force this write out before the read below
2226   if (!UseSystemMemoryBarrier) {
2227     __ membar(Assembler::Membar_mask_bits(
2228               Assembler::LoadLoad | Assembler::LoadStore |
2229               Assembler::StoreLoad | Assembler::StoreStore));
2230   }
2231 
2232   // check for safepoint operation in progress and/or pending suspend requests
2233   {
2234     Label Continue;
2235     Label slow_path;
2236 
2237     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2238 
2239     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2240     __ jcc(Assembler::equal, Continue);
2241     __ bind(slow_path);
2242 
2243     // Don't use call_VM as it will see a possible pending exception and forward it
2244     // and never return here, preventing us from clearing _last_native_pc down below.
2245     // We can't use call_VM_leaf either, as it will check to see if rsi & rdi are
2246     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2247     // by hand.
2248     //
2249     __ vzeroupper();
2250     save_native_result(masm, ret_type, stack_slots);
2251     __ mov(c_rarg0, r15_thread);
2252     __ mov(r12, rsp); // remember sp
2253     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2254     __ andptr(rsp, -16); // align stack as required by ABI
2255     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2256     __ mov(rsp, r12); // restore sp
2257     __ reinit_heapbase();
2258     // Restore any method result value
2259     restore_native_result(masm, ret_type, stack_slots);
2260     __ bind(Continue);
2261   }
2262 
2263   // change thread state
2264   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2265   __ bind(after_transition);
2266 
2267   Label reguard;
2268   Label reguard_done;
2269   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2270   __ jcc(Assembler::equal, reguard);
2271   __ bind(reguard_done);
2272 
2273   // The native result, if any, is still live here.
2274 
2275   // Unlock
2276   Label slow_path_unlock;
2277   Label unlock_done;
2278   if (method->is_synchronized()) {
2279 
2280     Label fast_done;
2281 
2282     // Get locked oop from the handle we passed to jni
2283     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2284 
2285     if (LockingMode == LM_LEGACY) {
2286       Label not_recur;
2287       // Simple recursive lock?
2288       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2289       __ jcc(Assembler::notEqual, not_recur);
2290       __ dec_held_monitor_count();
2291       __ jmpb(fast_done);
2292       __ bind(not_recur);
2293     }
2294 
2295     // Must save rax if it is live now because cmpxchg must use it
2296     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2297       save_native_result(masm, ret_type, stack_slots);
2298     }
2299 
2300     if (LockingMode == LM_MONITOR) {
2301       __ jmp(slow_path_unlock);
2302     } else if (LockingMode == LM_LEGACY) {
2303       // get address of the stack lock
2304       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2305       //  get old displaced header
2306       __ movptr(old_hdr, Address(rax, 0));
2307 
2308       // Atomic swap old header if oop still contains the stack lock
2309       __ lock();
2310       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2311       __ jcc(Assembler::notEqual, slow_path_unlock);
2312       __ dec_held_monitor_count();
2313     } else {
2314       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2315       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2316       __ dec_held_monitor_count();
2317     }
2318 
2319     // slow path re-enters here
2320     __ bind(unlock_done);
2321     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2322       restore_native_result(masm, ret_type, stack_slots);
2323     }
2324 
2325     __ bind(fast_done);
2326   }
2327   {
2328     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2329     save_native_result(masm, ret_type, stack_slots);
2330     __ mov_metadata(c_rarg1, method());
2331     __ call_VM_leaf(
2332          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2333          r15_thread, c_rarg1);
2334     restore_native_result(masm, ret_type, stack_slots);
2335   }
2336 
2337   __ reset_last_Java_frame(false);
2338 
2339   // Unbox oop result, e.g. JNIHandles::resolve value.
2340   if (is_reference_type(ret_type)) {
2341     __ resolve_jobject(rax /* value */,
2342                        r15_thread /* thread */,
2343                        rcx /* tmp */);
2344   }
2345 
2346   if (CheckJNICalls) {
2347     // clear_pending_jni_exception_check
2348     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2349   }
2350 
2351   // reset handle block
2352   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2353   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
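       // (Zeroing the top of the active JNIHandleBlock effectively releases the local
       //  handles the native code created.)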
2354 
2355   // pop our frame
2356 
2357   __ leave();
2358 
2359   // Any exception pending?
2360   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2361   __ jcc(Assembler::notEqual, exception_pending);
2362 
2363   // Return
2364 
2365   __ ret(0);
2366 
2367   // Unexpected paths are out of line and go here
2368 
2369   // forward the exception
2370   __ bind(exception_pending);
2371 
2372   // and forward the exception
2373   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2374 
2375   // Slow path locking & unlocking
2376   if (method->is_synchronized()) {
2377 
2378     // BEGIN Slow path lock
2379     __ bind(slow_path_lock);
2380 
2381     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2382     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2383 
2384     // protect the args we've loaded
2385     save_args(masm, total_c_args, c_arg, out_regs);
2386 
2387     __ mov(c_rarg0, obj_reg);
2388     __ mov(c_rarg1, lock_reg);
2389     __ mov(c_rarg2, r15_thread);
2390 
2391     // Not a leaf but we have last_Java_frame setup as we want
2392     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2393     restore_args(masm, total_c_args, c_arg, out_regs);
2394 
2395 #ifdef ASSERT
2396     { Label L;
2397     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2398     __ jcc(Assembler::equal, L);
2399     __ stop("no pending exception allowed on exit from monitorenter");
2400     __ bind(L);
2401     }
2402 #endif
2403     __ jmp(lock_done);
2404 
2405     // END Slow path lock
2406 
2407     // BEGIN Slow path unlock
2408     __ bind(slow_path_unlock);
2409 
2410     // If we haven't already saved the native result we must save it now as xmm registers
2411     // are still exposed.
2412     __ vzeroupper();
2413     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2414       save_native_result(masm, ret_type, stack_slots);
2415     }
2416 
2417     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2418 
2419     __ mov(c_rarg0, obj_reg);
2420     __ mov(c_rarg2, r15_thread);
2421     __ mov(r12, rsp); // remember sp
2422     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2423     __ andptr(rsp, -16); // align stack as required by ABI
2424 
2425     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2426     // NOTE that obj_reg == rbx currently
2427     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2428     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2429 
2430     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2431     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2432     __ mov(rsp, r12); // restore sp
2433     __ reinit_heapbase();
2434 #ifdef ASSERT
2435     {
2436       Label L;
2437       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2438       __ jcc(Assembler::equal, L);
2439       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2440       __ bind(L);
2441     }
2442 #endif /* ASSERT */
2443 
2444     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2445 
2446     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2447       restore_native_result(masm, ret_type, stack_slots);
2448     }
2449     __ jmp(unlock_done);
2450 
2451     // END Slow path unlock
2452 
2453   } // synchronized
2454 
2455   // SLOW PATH Reguard the stack if needed
2456 
2457   __ bind(reguard);
2458   __ vzeroupper();
2459   save_native_result(masm, ret_type, stack_slots);
2460   __ mov(r12, rsp); // remember sp
2461   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2462   __ andptr(rsp, -16); // align stack as required by ABI
2463   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2464   __ mov(rsp, r12); // restore sp
2465   __ reinit_heapbase();
2466   restore_native_result(masm, ret_type, stack_slots);
2467   // and continue
2468   __ jmp(reguard_done);
2469 
2470 
2471 
2472   __ flush();
2473 
2474   nmethod *nm = nmethod::new_native_nmethod(method,
2475                                             compile_id,
2476                                             masm->code(),
2477                                             vep_offset,
2478                                             frame_complete,
2479                                             stack_slots / VMRegImpl::slots_per_word,
2480                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2481                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2482                                             oop_maps);
2483 
2484   return nm;
2485 }
2486 
2487 // This function returns the adjustment size (in number of words) to a c2i adapter
2488 // activation for use during deoptimization
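     // For example, callee_locals == 5 and callee_parameters == 2 yield an adjustment of
     // 3 * Interpreter::stackElementWords words.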
2489 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2490   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2491 }
2492 
2493 
2494 uint SharedRuntime::out_preserve_stack_slots() {
2495   return 0;
2496 }
2497 
2498 
2499 // Number of stack slots between incoming argument block and the start of
2500 // a new frame.  The PROLOG must add this many slots to the stack.  The
2501 // EPILOG must remove this many slots.  amd64 needs two slots for
2502 // return address.
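     // The constant 4 below presumably accounts for the return address (2 slots) plus
     // the saved rbp (2 slots).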
2503 uint SharedRuntime::in_preserve_stack_slots() {
2504   return 4 + 2 * VerifyStackAtCalls;
2505 }
2506 
2507 //------------------------------generate_deopt_blob----------------------------
2508 void SharedRuntime::generate_deopt_blob() {
2509   // Allocate space for the code
2510   ResourceMark rm;
2511   // Setup code generation tools
2512   int pad = 0;
2513   if (UseAVX > 2) {
2514     pad += 1024;
2515   }
2516 #if INCLUDE_JVMCI
2517   if (EnableJVMCI) {
2518     pad += 512; // Increase the buffer size when compiling for JVMCI
2519   }
2520 #endif
2521   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2522   MacroAssembler* masm = new MacroAssembler(&buffer);
2523   int frame_size_in_words;
2524   OopMap* map = nullptr;
2525   OopMapSet *oop_maps = new OopMapSet();
2526 
2527   // -------------
2528   // This code enters when returning to a de-optimized nmethod.  A return
2529   // address has been pushed on the stack, and return values are in
2530   // registers.
2531   // If we are doing a normal deopt then we were called from the patched
2532   // nmethod from the point we returned to the nmethod. So the return
2533   // address on the stack is wrong by NativeCall::instruction_size
2534   // We will adjust the value so it looks like we have the original return
2535   // address on the stack (like when we eagerly deoptimized).
2536   // In the case of an exception pending when deoptimizing, we enter
2537   // with a return address on the stack that points after the call we patched
2538   // into the exception handler. We have the following register state from,
2539   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2540   //    rax: exception oop
2541   //    rbx: exception handler
2542   //    rdx: throwing pc
2543   // So in this case we simply jam rdx into the useless return address and
2544   // the stack looks just like we want.
2545   //
2546   // At this point we need to de-opt.  We save the argument return
2547   // registers.  We call the first C routine, fetch_unroll_info().  This
2548   // routine captures the return values and returns a structure which
2549   // describes the current frame size and the sizes of all replacement frames.
2550   // The current frame is compiled code and may contain many inlined
2551   // functions, each with their own JVM state.  We pop the current frame, then
2552   // push all the new frames.  Then we call the C routine unpack_frames() to
2553   // populate these frames.  Finally unpack_frames() returns us the new target
2554   // address.  Notice that callee-save registers are BLOWN here; they have
2555   // already been captured in the vframeArray at the time the return PC was
2556   // patched.
2557   address start = __ pc();
2558   Label cont;
2559 
2560   // Prolog for the non-exception case!
2561 
2562   // Save everything in sight.
2563   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2564 
2565   // Normal deoptimization.  Save exec mode for unpack_frames.
2566   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2567   __ jmp(cont);
2568 
2569   int reexecute_offset = __ pc() - start;
2570 #if INCLUDE_JVMCI && !defined(COMPILER1)
2571   if (EnableJVMCI && UseJVMCICompiler) {
2572     // JVMCI does not use this kind of deoptimization
2573     __ should_not_reach_here();
2574   }
2575 #endif
2576 
2577   // Reexecute case
2578   // The return address is the pc that describes what bci to re-execute at.
2579 
2580   // No need to update map as each call to save_live_registers will produce identical oopmap
2581   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2582 
2583   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2584   __ jmp(cont);
2585 
2586 #if INCLUDE_JVMCI
2587   Label after_fetch_unroll_info_call;
2588   int implicit_exception_uncommon_trap_offset = 0;
2589   int uncommon_trap_offset = 0;
2590 
2591   if (EnableJVMCI) {
2592     implicit_exception_uncommon_trap_offset = __ pc() - start;
2593 
2594     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2595     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2596 
2597     uncommon_trap_offset = __ pc() - start;
2598 
2599     // Save everything in sight.
2600     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2601     // fetch_unroll_info needs to call last_java_frame()
2602     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2603 
2604     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2605     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2606 
2607     __ movl(r14, Deoptimization::Unpack_reexecute);
2608     __ mov(c_rarg0, r15_thread);
2609     __ movl(c_rarg2, r14); // exec mode
2610     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2611     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2612 
2613     __ reset_last_Java_frame(false);
2614 
2615     __ jmp(after_fetch_unroll_info_call);
2616   } // EnableJVMCI
2617 #endif // INCLUDE_JVMCI
2618 
2619   int exception_offset = __ pc() - start;
2620 
2621   // Prolog for exception case
2622 
2623   // all registers are dead at this entry point, except for rax, and
2624   // rdx which contain the exception oop and exception pc
2625   // respectively.  Set them in TLS and fall thru to the
2626   // unpack_with_exception_in_tls entry point.
2627 
2628   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2629   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2630 
2631   int exception_in_tls_offset = __ pc() - start;
2632 
2633   // new implementation because exception oop is now passed in JavaThread
2634 
2635   // Prolog for exception case
2636   // All registers must be preserved because they might be used by LinearScan
2637   // Exception oop and throwing PC are passed in JavaThread
2638   // tos: stack at point of call to method that threw the exception (i.e. only
2639   // args are on the stack, no return address)
2640 
2641   // make room on stack for the return address
2642   // It will be patched later with the throwing pc. The correct value is not
2643   // available now because loading it from memory would destroy registers.
2644   __ push(0);
2645 
2646   // Save everything in sight.
2647   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2648 
2649   // Now it is safe to overwrite any register
2650 
2651   // Deopt during an exception.  Save exec mode for unpack_frames.
2652   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2653 
2654   // load throwing pc from JavaThread and patch it as the return address
2655   // of the current frame. Then clear the field in JavaThread
2656 
2657   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2658   __ movptr(Address(rbp, wordSize), rdx);
2659   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2660 
2661 #ifdef ASSERT
2662   // verify that there is really an exception oop in JavaThread
2663   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2664   __ verify_oop(rax);
2665 
2666   // verify that there is no pending exception
2667   Label no_pending_exception;
2668   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2669   __ testptr(rax, rax);
2670   __ jcc(Assembler::zero, no_pending_exception);
2671   __ stop("must not have pending exception here");
2672   __ bind(no_pending_exception);
2673 #endif
2674 
2675   __ bind(cont);
2676 
2677   // Call C code.  Need thread and this frame, but NOT official VM entry
2678   // crud.  We cannot block on this call, no GC can happen.
2679   //
2680   // UnrollBlock* fetch_unroll_info(JavaThread* thread, int exec_mode)
2681 
2682   // fetch_unroll_info needs to call last_java_frame().
2683 
2684   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2685 #ifdef ASSERT
2686   { Label L;
2687     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2688     __ jcc(Assembler::equal, L);
2689     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2690     __ bind(L);
2691   }
2692 #endif // ASSERT
2693   __ mov(c_rarg0, r15_thread);
2694   __ movl(c_rarg1, r14); // exec_mode
2695   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2696 
2697   // Need to have an oopmap that tells fetch_unroll_info where to
2698   // find any register it might need.
2699   oop_maps->add_gc_map(__ pc() - start, map);
2700 
2701   __ reset_last_Java_frame(false);
2702 
2703 #if INCLUDE_JVMCI
2704   if (EnableJVMCI) {
2705     __ bind(after_fetch_unroll_info_call);
2706   }
2707 #endif
2708 
2709   // Load UnrollBlock* into rdi
2710   __ mov(rdi, rax);
2711 
2712   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2713   Label noException;
2714   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2715   __ jcc(Assembler::notEqual, noException);
2716   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2717   // QQQ this load is useless: exception_pc was cleared to null above
2718   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2719   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2720   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2721 
2722   __ verify_oop(rax);
2723 
2724   // Overwrite the result registers with the exception results.
2725   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2726   // This store of rdx is likely unnecessary (see the note above), but harmless
2727   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2728 
2729   __ bind(noException);
2730 
2731   // Only register save data is on the stack.
2732   // Now restore the result registers.  Everything else is either dead
2733   // or captured in the vframeArray.
2734   RegisterSaver::restore_result_registers(masm);
2735 
2736   // All of the register save area has been popped off the stack. Only the
2737   // return address remains.
2738 
2739   // Pop all the frames we must move/replace.
2740   //
2741   // Frame picture (youngest to oldest)
2742   // 1: self-frame (no frame link)
2743   // 2: deopting frame  (no frame link)
2744   // 3: caller of deopting frame (could be compiled/interpreted).
2745   //
2746   // Note: by leaving the return address of self-frame on the stack
2747   // and using the size of frame 2 to adjust the stack
2748   // when we are done the return to frame 3 will still be on the stack.
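  // When the frame-pushing loop below is done, the stack is expected to hold
  // (oldest first): the caller (3), possibly extended by caller_adjustment,
  // one skeletal interpreter frame per virtual frame of the deoptimized
  // method (outermost first), and a re-pushed self-frame from which
  // unpack_frames() is called to fill in the skeletons.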
2749 
2750   // Pop deoptimized frame
2751   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2752   __ addptr(rsp, rcx);
2753 
2754   // rsp should be pointing at the return address to the caller (3)
2755 
2756   // Pick up the initial fp we should save
2757   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2758   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2759 
2760 #ifdef ASSERT
2761   // Compilers generate code that bangs the stack by as much as the
2762   // interpreter would need, so this stack banging should never
2763   // trigger a fault. Verify that it does not on non-product builds.
2764   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2765   __ bang_stack_size(rbx, rcx);
2766 #endif
2767 
2768   // Load address of array of frame pcs into rcx
2769   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2770 
2771   // Trash the old pc
2772   __ addptr(rsp, wordSize);
2773 
2774   // Load address of array of frame sizes into rsi
2775   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2776 
2777   // Load counter into rdx
2778   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2779 
2780   // Now adjust the caller's stack to make up for the extra locals
2781   // but record the original sp so that we can save it in the skeletal interpreter
2782   // frame and the stack walking of interpreter_sender will get the unextended sp
2783   // value and not the "real" sp value.
2784 
2785   const Register sender_sp = r8;
2786 
2787   __ mov(sender_sp, rsp);
2788   __ movl(rbx, Address(rdi,
2789                        Deoptimization::UnrollBlock::
2790                        caller_adjustment_offset()));
2791   __ subptr(rsp, rbx);
2792 
2793   // Push interpreter frames in a loop
2794   Label loop;
2795   __ bind(loop);
2796   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2797   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2798   __ pushptr(Address(rcx, 0));          // Save return address
2799   __ enter();                           // Save old & set new ebp
2800   __ subptr(rsp, rbx);                  // Prolog
2801   // This value is corrected by layout_activation_impl
2802   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2803   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2804   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2805   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2806   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2807   __ decrementl(rdx);                   // Decrement counter
2808   __ jcc(Assembler::notZero, loop);
2809   __ pushptr(Address(rcx, 0));          // Save final return address
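  // Each loop iteration above materializes one skeletal interpreter frame:
  // return pc, saved rbp, sender_sp and a cleared last_sp. The interpreter
  // state proper (locals, expression stack, bcp) is filled in later by
  // Deoptimization::unpack_frames().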
2810 
2811   // Re-push self-frame
2812   __ enter();                           // Save old & set new ebp
2813 
2814   // Allocate a full sized register save area.
2815   // Return address and rbp are in place, so we allocate two fewer words.
2816   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2817 
2818   // Restore frame locals after moving the frame
2819   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2820   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2821 
2822   // Call C code.  Need thread but NOT official VM entry
2823   // crud.  We cannot block on this call, no GC can happen.  Call should
2824   // restore return values to their stack-slots with the new SP.
2825   //
2826   // BasicType Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2827 
2828   // Use rbp because the frames look interpreted now
2829   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2830   // Don't need the precise return PC here, just precise enough to point into this code blob.
2831   address the_pc = __ pc();
2832   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2833 
2834   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2835   __ mov(c_rarg0, r15_thread);
2836   __ movl(c_rarg1, r14); // second arg: exec_mode
2837   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2838   // Revert SP alignment after call since we're going to do some SP relative addressing below
2839   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2840 
2841   // Set an oopmap for the call site
2842   // Use the same PC we used for the last java frame
2843   oop_maps->add_gc_map(the_pc - start,
2844                        new OopMap( frame_size_in_words, 0 ));
2845 
2846   // Clear fp AND pc
2847   __ reset_last_Java_frame(true);
2848 
2849   // Collect return values
2850   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2851   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2852   // I think this is useless (throwing pc?)
2853   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2854 
2855   // Pop self-frame.
2856   __ leave();                           // Epilog
2857 
2858   // Jump to interpreter
2859   __ ret(0);
2860 
2861   // Make sure all code is generated
2862   masm->flush();
2863 
2864   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2865   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2866 #if INCLUDE_JVMCI
2867   if (EnableJVMCI) {
2868     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2869     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2870   }
2871 #endif
2872 }
2873 
2874 #ifdef COMPILER2
2875 //------------------------------generate_uncommon_trap_blob--------------------
2876 void SharedRuntime::generate_uncommon_trap_blob() {
2877   // Allocate space for the code
2878   ResourceMark rm;
2879   // Setup code generation tools
2880   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2881   MacroAssembler* masm = new MacroAssembler(&buffer);
2882 
2883   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2884 
2885   address start = __ pc();
2886 
2887   if (UseRTMLocking) {
2888     // Abort RTM transaction before possible nmethod deoptimization.
2889     __ xabort(0);
2890   }
2891 
2892   // Push self-frame.  We get here with a return address on the
2893   // stack, so rsp is 8-byte aligned until we allocate our frame.
2894   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
2895 
2896   // No callee saved registers. rbp is assumed implicitly saved
2897   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2898 
2899   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2900   // runtime expects it.
2901   __ movl(c_rarg1, j_rarg0);
2902 
2903   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2904 
2905   // Call C code.  Need thread but NOT official VM entry
2906   // crud.  We cannot block on this call, no GC can happen.  Call should
2907   // capture callee-saved registers as well as return values.
2908   // Thread is in rdi already.
2909   //
2910   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2911 
2912   __ mov(c_rarg0, r15_thread);
2913   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2914   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2915 
2916   // Set an oopmap for the call site
2917   OopMapSet* oop_maps = new OopMapSet();
2918   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2919 
2920   // location of rbp is known implicitly by the frame sender code
2921 
2922   oop_maps->add_gc_map(__ pc() - start, map);
2923 
2924   __ reset_last_Java_frame(false);
2925 
2926   // Load UnrollBlock* into rdi
2927   __ mov(rdi, rax);
2928 
2929 #ifdef ASSERT
2930   { Label L;
2931     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
2932               Deoptimization::Unpack_uncommon_trap);
2933     __ jcc(Assembler::equal, L);
2934     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2935     __ bind(L);
2936   }
2937 #endif
2938 
2939   // Pop all the frames we must move/replace.
2940   //
2941   // Frame picture (youngest to oldest)
2942   // 1: self-frame (no frame link)
2943   // 2: deopting frame  (no frame link)
2944   // 3: caller of deopting frame (could be compiled/interpreted).
2945 
2946   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2947   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2948 
2949   // Pop deoptimized frame (int)
2950   __ movl(rcx, Address(rdi,
2951                        Deoptimization::UnrollBlock::
2952                        size_of_deoptimized_frame_offset()));
2953   __ addptr(rsp, rcx);
2954 
2955   // rsp should be pointing at the return address to the caller (3)
2956 
2957   // Pick up the initial fp we should save
2958   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2959   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2960 
2961 #ifdef ASSERT
2962   // Compilers generate code that bangs the stack by as much as the
2963   // interpreter would need, so this stack banging should never
2964   // trigger a fault. Verify that it does not on non-product builds.
2965   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2966   __ bang_stack_size(rbx, rcx);
2967 #endif
2968 
2969   // Load address of array of frame pcs into rcx (address*)
2970   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2971 
2972   // Trash the return pc
2973   __ addptr(rsp, wordSize);
2974 
2975   // Load address of array of frame sizes into rsi (intptr_t*)
2976   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2977 
2978   // Counter
2979   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
2980 
2981   // Now adjust the caller's stack to make up for the extra locals but
2982   // record the original sp so that we can save it in the skeletal
2983   // interpreter frame and the stack walking of interpreter_sender
2984   // will get the unextended sp value and not the "real" sp value.
2985 
2986   const Register sender_sp = r8;
2987 
2988   __ mov(sender_sp, rsp);
2989   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
2990   __ subptr(rsp, rbx);
2991 
2992   // Push interpreter frames in a loop
2993   Label loop;
2994   __ bind(loop);
2995   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2996   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2997   __ pushptr(Address(rcx, 0));     // Save return address
2998   __ enter();                      // Save old & set new rbp
2999   __ subptr(rsp, rbx);             // Prolog
3000   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3001             sender_sp);            // Make it walkable
3002   // This value is corrected by layout_activation_impl
3003   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3004   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3005   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3006   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3007   __ decrementl(rdx);              // Decrement counter
3008   __ jcc(Assembler::notZero, loop);
3009   __ pushptr(Address(rcx, 0));     // Save final return address
3010 
3011   // Re-push self-frame
3012   __ enter();                 // Save old & set new rbp
3013   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); // Prolog
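  // SimpleRuntimeFrame::framesize is counted in 32-bit slots; the return
  // address pushed above and the rbp saved by enter() already occupy four
  // of them, hence the "- 4".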
3015 
3016   // Use rbp because the frames look interpreted now
3017   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3018   // Don't need the precise return PC here, just precise enough to point into this code blob.
3019   address the_pc = __ pc();
3020   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3021 
3022   // Call C code.  Need thread but NOT official VM entry
3023   // crud.  We cannot block on this call, no GC can happen.  Call should
3024   // restore return values to their stack-slots with the new SP.
3025   // Thread is in rdi already.
3026   //
3027   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3028 
3029   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3030   __ mov(c_rarg0, r15_thread);
3031   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3032   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3033 
3034   // Set an oopmap for the call site
3035   // Use the same PC we used for the last java frame
3036   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3037 
3038   // Clear fp AND pc
3039   __ reset_last_Java_frame(true);
3040 
3041   // Pop self-frame.
3042   __ leave();                 // Epilog
3043 
3044   // Jump to interpreter
3045   __ ret(0);
3046 
3047   // Make sure all code is generated
3048   masm->flush();
3049 
3050   _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
3051                                                  SimpleRuntimeFrame::framesize >> 1);
3052 }
3053 #endif // COMPILER2
3054 
3055 //------------------------------generate_handler_blob------
3056 //
3057 // Generate a special Compile2Runtime blob that saves all registers
3058 // and sets up an oopmap.
3059 //
3060 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3061   assert(StubRoutines::forward_exception_entry() != nullptr,
3062          "must be generated before");
3063 
3064   ResourceMark rm;
3065   OopMapSet *oop_maps = new OopMapSet();
3066   OopMap* map;
3067 
3068   // Allocate space for the code.  Setup code generation tools.
3069   CodeBuffer buffer("handler_blob", 2048, 1024);
3070   MacroAssembler* masm = new MacroAssembler(&buffer);
3071 
3072   address start   = __ pc();
3073   address call_pc = nullptr;
3074   int frame_size_in_words;
3075   bool cause_return = (poll_type == POLL_AT_RETURN);
3076   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
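  // cause_return is true for polls taken at returns, where the return address
  // already on the stack is the correct resume pc. For loop polls the pc of
  // the poll instruction was stashed by the signal handler in
  // JavaThread::saved_exception_pc; a slot is pushed below and patched with
  // that pc (possibly advanced past the poll instruction before returning).
  // Wide vector state only needs to be preserved for polls placed in
  // vectorized loops (POLL_AT_VECTOR_LOOP).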
3077 
3078   if (UseRTMLocking) {
3079     // Abort RTM transaction before calling runtime
3080     // because critical section will be large and will be
3081     // aborted anyway. Also nmethod could be deoptimized.
3082     __ xabort(0);
3083   }
3084 
3085   // Make room for return address (or push it again)
3086   if (!cause_return) {
3087     __ push(rbx);
3088   }
3089 
3090   // Save registers, fpu state, and flags
3091   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3092 
3093   // The following is basically a call_VM.  However, we need the precise
3094   // address of the call in order to generate an oopmap. Hence, we do all the
3095   // work ourselves.
3096 
3097   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3098 
3099   // The return address must always be correct so that frame constructor never
3100   // sees an invalid pc.
3101 
3102   if (!cause_return) {
3103     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3104     // Additionally, rbx is a callee saved register and we can look at it later to determine
3105     // if someone changed the return address for us!
3106     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3107     __ movptr(Address(rbp, wordSize), rbx);
3108   }
3109 
3110   // Do the call
3111   __ mov(c_rarg0, r15_thread);
3112   __ call(RuntimeAddress(call_ptr));
3113 
3114   // Set an oopmap for the call site.  This oopmap will map all
3115   // oop-registers and debug-info registers as callee-saved.  This
3116   // will allow deoptimization at this safepoint to find all possible
3117   // debug-info recordings, as well as let GC find all oops.
3118 
3119   oop_maps->add_gc_map( __ pc() - start, map);
3120 
3121   Label noException;
3122 
3123   __ reset_last_Java_frame(false);
3124 
3125   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3126   __ jcc(Assembler::equal, noException);
3127 
3128   // Exception pending
3129 
3130   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3131 
3132   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3133 
3134   // No exception case
3135   __ bind(noException);
3136 
3137   Label no_adjust;
3138 #ifdef ASSERT
3139   Label bail;
3140 #endif
3141   if (!cause_return) {
3142     Label no_prefix, not_special;
3143 
3144     // If our stashed return pc was modified by the runtime we avoid touching it
3145     __ cmpptr(rbx, Address(rbp, wordSize));
3146     __ jccb(Assembler::notEqual, no_adjust);
3147 
3148     // Skip over the poll instruction.
3149     // See NativeInstruction::is_safepoint_poll()
3150     // Possible encodings:
3151     //      85 00       test   %eax,(%rax)
3152     //      85 01       test   %eax,(%rcx)
3153     //      85 02       test   %eax,(%rdx)
3154     //      85 03       test   %eax,(%rbx)
3155     //      85 06       test   %eax,(%rsi)
3156     //      85 07       test   %eax,(%rdi)
3157     //
3158     //   41 85 00       test   %eax,(%r8)
3159     //   41 85 01       test   %eax,(%r9)
3160     //   41 85 02       test   %eax,(%r10)
3161     //   41 85 03       test   %eax,(%r11)
3162     //   41 85 06       test   %eax,(%r14)
3163     //   41 85 07       test   %eax,(%r15)
3164     //
3165     //      85 04 24    test   %eax,(%rsp)
3166     //   41 85 04 24    test   %eax,(%r12)
3167     //      85 45 00    test   %eax,0x0(%rbp)
3168     //   41 85 45 00    test   %eax,0x0(%r13)
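    //
    // So the poll is two to four bytes long: an optional REX.B prefix, the
    // 0x85 opcode and its ModRM byte, plus one extra byte (SIB or disp8)
    // when the base register is rsp/rbp/r12/r13. The code below recomputes
    // that length and advances the stashed return pc past it.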
3169 
3170     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3171     __ jcc(Assembler::notEqual, no_prefix);
3172     __ addptr(rbx, 1);
3173     __ bind(no_prefix);
3174 #ifdef ASSERT
3175     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3176 #endif
3177     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3178     // r12/rsp 0x04
3179     // r13/rbp 0x05
3180     __ movzbq(rcx, Address(rbx, 1));
3181     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3182     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3183     __ cmpptr(rcx, 1);
3184     __ jcc(Assembler::above, not_special);
3185     __ addptr(rbx, 1);
3186     __ bind(not_special);
3187 #ifdef ASSERT
3188     // Verify the correct encoding of the poll we're about to skip.
3189     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3190     __ jcc(Assembler::notEqual, bail);
3191     // Mask out the modrm bits
3192     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3193     // rax encodes to 0, so if the bits are nonzero it's incorrect
3194     __ jcc(Assembler::notZero, bail);
3195 #endif
3196     // Adjust return pc forward to step over the safepoint poll instruction
3197     __ addptr(rbx, 2);
3198     __ movptr(Address(rbp, wordSize), rbx);
3199   }
3200 
3201   __ bind(no_adjust);
3202   // Normal exit, restore registers and exit.
3203   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3204   __ ret(0);
3205 
3206 #ifdef ASSERT
3207   __ bind(bail);
3208   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3209 #endif
3210 
3211   // Make sure all code is generated
3212   masm->flush();
3213 
3214   // Fill-out other meta info
3215   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3216 }
3217 
3218 //
3219 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3220 //
3221 // Generate a stub that calls into the VM to find out the proper destination
3222 // of a Java call. All the argument registers are live at this point,
3223 // but since this is generic code we don't know what they are, and the caller
3224 // must do any GC of the args.
3225 //
3226 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3227   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3228 
3229   // allocate space for the code
3230   ResourceMark rm;
3231 
3232   CodeBuffer buffer(name, 1200, 512);
3233   MacroAssembler* masm = new MacroAssembler(&buffer);
3234 
3235   int frame_size_in_words;
3236 
3237   OopMapSet *oop_maps = new OopMapSet();
3238   OopMap* map = nullptr;
3239 
3240   int start = __ offset();
3241 
3242   // No need to save vector registers since they are caller-saved anyway.
3243   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3244 
3245   int frame_complete = __ offset();
3246 
3247   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3248 
3249   __ mov(c_rarg0, r15_thread);
3250 
3251   __ call(RuntimeAddress(destination));
3252 
3253 
3254   // Set an oopmap for the call site.
3255   // We need this not only for callee-saved registers, but also for volatile
3256   // registers that the compiler might be keeping live across a safepoint.
3257 
3258   oop_maps->add_gc_map( __ offset() - start, map);
3259 
3260   // rax contains the address we are going to jump to assuming no exception got installed
3261 
3262   // clear last_Java_sp
3263   __ reset_last_Java_frame(false);
3264   // check for pending exceptions
3265   Label pending;
3266   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3267   __ jcc(Assembler::notEqual, pending);
3268 
3269   // get the returned Method*
3270   __ get_vm_result_2(rbx, r15_thread);
3271   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3272 
3273   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
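  // restore_live_registers() below reloads every register from the save area,
  // so storing the results there is what lets them survive the restore: rbx
  // is where interpreter/c2i entries expect the callee Method*, and rax holds
  // the code entry point we jump to.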
3274 
3275   RegisterSaver::restore_live_registers(masm);
3276 
3277   // We are back to the original state on entry and ready to go.
3278 
3279   __ jmp(rax);
3280 
3281   // Pending exception after the safepoint
3282 
3283   __ bind(pending);
3284 
3285   RegisterSaver::restore_live_registers(masm);
3286 
3287   // exception pending => remove activation and forward to exception handler
3288 
3289   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3290 
3291   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3292   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3293 
3294   // -------------
3295   // make sure all code is generated
3296   masm->flush();
3297 
3298   // Return the blob. The frame size passed to new_runtime_stub() is in
3299   // words (frame_size_in_words).
3300   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3301 }
3302 
3303 //------------------------------Montgomery multiplication------------------------
3304 //
3305 
3306 #ifndef _WINDOWS
3307 
3308 // Subtract 0:b from carry:a.  Return carry.
3309 static julong
3310 sub(julong a[], julong b[], julong carry, long len) {
3311   long long i = 0, cnt = len;
3312   julong tmp;
3313   asm volatile("clc; "
3314                "0: ; "
3315                "mov (%[b], %[i], 8), %[tmp]; "
3316                "sbb %[tmp], (%[a], %[i], 8); "
3317                "inc %[i]; dec %[cnt]; "
3318                "jne 0b; "
3319                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3320                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3321                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3322                : "memory");
3323   return tmp;
3324 }
3325 
3326 // Multiply (unsigned) Long A by Long B, accumulating the double-
3327 // length result into the accumulator formed of T0, T1, and T2.
3328 #define MACC(A, B, T0, T1, T2)                                  \
3329 do {                                                            \
3330   unsigned long hi, lo;                                         \
3331   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3332            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3333            : "r"(A), "a"(B) : "cc");                            \
3334  } while(0)
3335 
3336 // As above, but add twice the double-length result into the
3337 // accumulator.
3338 #define MACC2(A, B, T0, T1, T2)                                 \
3339 do {                                                            \
3340   unsigned long hi, lo;                                         \
3341   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3342            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3343            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3344            : "r"(A), "a"(B) : "cc");                            \
3345  } while(0)
3346 
3347 #else //_WINDOWS
3348 
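// Subtract 0:b from carry:a.  Return carry.  This uses the two's-complement
// identity a - b == a + ~b + 1: the loop feeds ~b[i] and an initial carry of
// one into _addcarry_u64.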
3349 static julong
3350 sub(julong a[], julong b[], julong carry, long len) {
3351   long i;
3352   julong tmp;
3353   unsigned char c = 1;
3354   for (i = 0; i < len; i++) {
3355     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3356     a[i] = tmp;
3357   }
3358   c = _addcarry_u64(c, carry, ~0, &tmp);
3359   return tmp;
3360 }
3361 
3362 // Multiply (unsigned) Long A by Long B, accumulating the double-
3363 // length result into the accumulator formed of T0, T1, and T2.
3364 #define MACC(A, B, T0, T1, T2)                          \
3365 do {                                                    \
3366   julong hi, lo;                                        \
3367   lo = _umul128(A, B, &hi);                             \
3368   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3369   c = _addcarry_u64(c, hi, T1, &T1);                    \
3370   _addcarry_u64(c, T2, 0, &T2);                         \
3371  } while(0)
3372 
3373 // As above, but add twice the double-length result into the
3374 // accumulator.
3375 #define MACC2(A, B, T0, T1, T2)                         \
3376 do {                                                    \
3377   julong hi, lo;                                        \
3378   lo = _umul128(A, B, &hi);                             \
3379   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3380   c = _addcarry_u64(c, hi, T1, &T1);                    \
3381   _addcarry_u64(c, T2, 0, &T2);                         \
3382   c = _addcarry_u64(0, lo, T0, &T0);                    \
3383   c = _addcarry_u64(c, hi, T1, &T1);                    \
3384   _addcarry_u64(c, T2, 0, &T2);                         \
3385  } while(0)
3386 
3387 #endif //_WINDOWS
3388 
3389 // Fast Montgomery multiplication.  The derivation of the algorithm is
3390 // in  A Cryptographic Library for the Motorola DSP56000,
3391 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
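//
// A sketch of what the routines below compute (see the paper for the
// derivation): the operands are arrays of len 64-bit words, least-significant
// word first, and inv == -n[0]^-1 mod 2^64 (checked by the asserts).  The
// accumulator t2:t1:t0 is a 192-bit value; MACC adds the 128-bit product A*B
// into it and MACC2 adds the product twice (for the symmetric terms of the
// square).  At step i the loops gather the i-th "diagonal" of a*b (or a*a)
// plus the reduction terms m[j]*n[i-j]; choosing m[i] = t0 * inv makes the
// low word of the accumulator vanish after adding m[i]*n[0], so the
// accumulator can shift right by one word.  The result m is congruent to
// a*b*R^-1 mod n, where R = 2^(64*len); the trailing while loop subtracts n
// while the result has overflowed its len words.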
3392 
3393 static void NOINLINE
3394 montgomery_multiply(julong a[], julong b[], julong n[],
3395                     julong m[], julong inv, int len) {
3396   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3397   int i;
3398 
3399   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3400 
3401   for (i = 0; i < len; i++) {
3402     int j;
3403     for (j = 0; j < i; j++) {
3404       MACC(a[j], b[i-j], t0, t1, t2);
3405       MACC(m[j], n[i-j], t0, t1, t2);
3406     }
3407     MACC(a[i], b[0], t0, t1, t2);
3408     m[i] = t0 * inv;
3409     MACC(m[i], n[0], t0, t1, t2);
3410 
3411     assert(t0 == 0, "broken Montgomery multiply");
3412 
3413     t0 = t1; t1 = t2; t2 = 0;
3414   }
3415 
3416   for (i = len; i < 2*len; i++) {
3417     int j;
3418     for (j = i-len+1; j < len; j++) {
3419       MACC(a[j], b[i-j], t0, t1, t2);
3420       MACC(m[j], n[i-j], t0, t1, t2);
3421     }
3422     m[i-len] = t0;
3423     t0 = t1; t1 = t2; t2 = 0;
3424   }
3425 
3426   while (t0)
3427     t0 = sub(m, n, t0, len);
3428 }
3429 
3430 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3431 // multiplies so it should be up to 25% faster than Montgomery
3432 // multiplication.  However, its loop control is more complex and it
3433 // may actually run slower on some machines.
3434 
3435 static void NOINLINE
3436 montgomery_square(julong a[], julong n[],
3437                   julong m[], julong inv, int len) {
3438   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3439   int i;
3440 
3441   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3442 
3443   for (i = 0; i < len; i++) {
3444     int j;
3445     int end = (i+1)/2;
3446     for (j = 0; j < end; j++) {
3447       MACC2(a[j], a[i-j], t0, t1, t2);
3448       MACC(m[j], n[i-j], t0, t1, t2);
3449     }
3450     if ((i & 1) == 0) {
3451       MACC(a[j], a[j], t0, t1, t2);
3452     }
3453     for (; j < i; j++) {
3454       MACC(m[j], n[i-j], t0, t1, t2);
3455     }
3456     m[i] = t0 * inv;
3457     MACC(m[i], n[0], t0, t1, t2);
3458 
3459     assert(t0 == 0, "broken Montgomery square");
3460 
3461     t0 = t1; t1 = t2; t2 = 0;
3462   }
3463 
3464   for (i = len; i < 2*len; i++) {
3465     int start = i-len+1;
3466     int end = start + (len - start)/2;
3467     int j;
3468     for (j = start; j < end; j++) {
3469       MACC2(a[j], a[i-j], t0, t1, t2);
3470       MACC(m[j], n[i-j], t0, t1, t2);
3471     }
3472     if ((i & 1) == 0) {
3473       MACC(a[j], a[j], t0, t1, t2);
3474     }
3475     for (; j < len; j++) {
3476       MACC(m[j], n[i-j], t0, t1, t2);
3477     }
3478     m[i-len] = t0;
3479     t0 = t1; t1 = t2; t2 = 0;
3480   }
3481 
3482   while (t0)
3483     t0 = sub(m, n, t0, len);
3484 }
3485 
3486 // Swap words in a longword.
3487 static julong swap(julong x) {
3488   return (x << 32) | (x >> 32);
3489 }
3490 
3491 // Copy len longwords from s to d, word-swapping as we go.  The
3492 // destination array is reversed.
3493 static void reverse_words(julong *s, julong *d, int len) {
3494   d += len;
3495   while(len-- > 0) {
3496     d--;
3497     *d = swap(*s);
3498     s++;
3499   }
3500 }
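// Note: the jint arrays handed to the intrinsics below keep the most
// significant 32-bit word first (the BigInteger magnitude layout);
// reverse_words() converts that into the little-endian array of 64-bit words
// used above, and the same transform converts back.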
3501 
3502 // The threshold at which squaring is advantageous was determined
3503 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3504 #define MONTGOMERY_SQUARING_THRESHOLD 64
3505 
3506 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3507                                         jint len, jlong inv,
3508                                         jint *m_ints) {
3509   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3510   int longwords = len/2;
3511 
3512   // Make very sure we don't use so much space that the stack might
3513   // overflow.  512 jints correspond to a 16384-bit integer and
3514   // will use a total of 8K bytes of stack space here.
3515   int divisor = sizeof(julong) * 4;
3516   guarantee(longwords <= 8192 / divisor, "must be");
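  // With divisor == sizeof(julong) * 4 == 32 the guarantee caps longwords at
  // 256 (i.e. len <= 512 jints), so total_allocation below is at most
  // 256 * 8 * 4 == 8192 bytes.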
3517   int total_allocation = longwords * sizeof (julong) * 4;
3518   julong *scratch = (julong *)alloca(total_allocation);
3519 
3520   // Local scratch arrays
3521   julong
3522     *a = scratch + 0 * longwords,
3523     *b = scratch + 1 * longwords,
3524     *n = scratch + 2 * longwords,
3525     *m = scratch + 3 * longwords;
3526 
3527   reverse_words((julong *)a_ints, a, longwords);
3528   reverse_words((julong *)b_ints, b, longwords);
3529   reverse_words((julong *)n_ints, n, longwords);
3530 
3531   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3532 
3533   reverse_words(m, (julong *)m_ints, longwords);
3534 }
3535 
3536 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3537                                       jint len, jlong inv,
3538                                       jint *m_ints) {
3539   assert(len % 2 == 0, "array length in montgomery_square must be even");
3540   int longwords = len/2;
3541 
3542   // Make very sure we don't use so much space that the stack might
3543   // overflow.  512 jints correspond to a 16384-bit integer and
3544   // will use a total of 6K bytes of stack space here.
3545   int divisor = sizeof(julong) * 3;
3546   guarantee(longwords <= (8192 / divisor), "must be");
3547   int total_allocation = longwords * sizeof (julong) * 3;
3548   julong *scratch = (julong *)alloca(total_allocation);
3549 
3550   // Local scratch arrays
3551   julong
3552     *a = scratch + 0 * longwords,
3553     *n = scratch + 1 * longwords,
3554     *m = scratch + 2 * longwords;
3555 
3556   reverse_words((julong *)a_ints, a, longwords);
3557   reverse_words((julong *)n_ints, n, longwords);
3558 
3559   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3560     ::montgomery_square(a, n, m, (julong)inv, longwords);
3561   } else {
3562     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3563   }
3564 
3565   reverse_words(m, (julong *)m_ints, longwords);
3566 }
3567 
3568 #ifdef COMPILER2
3569 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3570 //
3571 //------------------------------generate_exception_blob---------------------------
3572 // Creates the exception blob at the end.
3573 // Compiled code jumps to this blob when throwing an exception
3574 // (see emit_exception_handler in the x86_64.ad file).
3575 //
3576 // Given an exception pc at a call we call into the runtime for the
3577 // handler in this method. This handler might merely restore state
3578 // (i.e. callee-saved registers), unwind the frame, and jump to the
3579 // exception handler for the nmethod if there is no Java-level handler
3580 // for the nmethod.
3581 //
3582 // This code is entered with a jmp.
3583 //
3584 // Arguments:
3585 //   rax: exception oop
3586 //   rdx: exception pc
3587 //
3588 // Results:
3589 //   rax: exception oop
3590 //   rdx: exception pc in caller or ???
3591 //   destination: exception handler of caller
3592 //
3593 // Note: the exception pc MUST be at a call (precise debug information)
3594 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3595 //
3596 
3597 void OptoRuntime::generate_exception_blob() {
3598   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3599   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3600   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3601 
3602   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3603 
3604   // Allocate space for the code
3605   ResourceMark rm;
3606   // Setup code generation tools
3607   CodeBuffer buffer("exception_blob", 2048, 1024);
3608   MacroAssembler* masm = new MacroAssembler(&buffer);
3609 
3610 
3611   address start = __ pc();
3612 
3613   // Exception pc is 'return address' for stack walker
3614   __ push(rdx);
3615   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3616 
3617   // Save callee-saved registers.  See x86_64.ad.
3618 
3619   // rbp is an implicitly saved callee saved register (i.e., the calling
3620   // convention will save/restore it in the prolog/epilog). Other than that
3621   // there are no callee save registers now that adapter frames are gone.
3622 
3623   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3624 
3625   // Store exception in Thread object. We cannot pass any arguments to the
3626   // handle_exception call, since we do not want to make any assumption
3627   // about the size of the frame where the exception happened in.
3628   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3629   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3630   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3631 
3632   // This call does all the hard work.  It checks if an exception handler
3633   // exists in the method.
3634   // If so, it returns the handler address.
3635   // If not, it prepares for stack-unwinding, restoring the callee-save
3636   // registers of the frame being removed.
3637   //
3638   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3639 
3640   // At a method handle call, the stack may not be properly aligned
3641   // when returning with an exception.
3642   address the_pc = __ pc();
3643   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3644   __ mov(c_rarg0, r15_thread);
3645   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3646   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3647 
3648   // Set an oopmap for the call site.  This oopmap will only be used if we
3649   // are unwinding the stack.  Hence, all locations will be dead.
3650   // Callee-saved registers will be the same as the frame above (i.e.,
3651   // handle_exception_stub), since they were restored when we got the
3652   // exception.
3653 
3654   OopMapSet* oop_maps = new OopMapSet();
3655 
3656   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3657 
3658   __ reset_last_Java_frame(false);
3659 
3660   // Restore callee-saved registers
3661 
3662   // rbp is an implicitly saved callee-saved register (i.e., the calling
3663   // convention will save/restore it in the prolog/epilog). Other than that
3664   // there are no callee save registers now that adapter frames are gone.
3665 
3666   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3667 
3668   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3669   __ pop(rdx);                  // No need for exception pc anymore
3670 
3671   // rax: exception handler
3672 
3673   // We have a handler in rax (could be deopt blob).
3674   __ mov(r8, rax);
3675 
3676   // Get the exception oop
3677   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3678   // Get the exception pc in case we are deoptimized
3679   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3680 #ifdef ASSERT
3681   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3682   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3683 #endif
3684   // Clear the exception oop so GC no longer processes it as a root.
3685   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3686 
3687   // rax: exception oop
3688   // r8:  exception handler
3689   // rdx: exception pc
3690   // Jump to handler
3691 
3692   __ jmp(r8);
3693 
3694   // Make sure all code is generated
3695   masm->flush();
3696 
3697   // Set exception blob
3698   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3699 }
3700 #endif // COMPILER2
3701