1 /*
   2  * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/icBuffer.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/compiledICHolder.hpp"
  46 #include "oops/klass.inline.hpp"
  47 #include "oops/method.inline.hpp"
  48 #include "prims/methodHandles.hpp"
  49 #include "runtime/continuation.hpp"
  50 #include "runtime/continuationEntry.inline.hpp"
  51 #include "runtime/globals.hpp"
  52 #include "runtime/jniHandles.hpp"
  53 #include "runtime/safepointMechanism.hpp"
  54 #include "runtime/sharedRuntime.hpp"
  55 #include "runtime/signature.hpp"
  56 #include "runtime/stubRoutines.hpp"
  57 #include "runtime/vframeArray.hpp"
  58 #include "runtime/vm_version.hpp"
  59 #include "utilities/align.hpp"
  60 #include "utilities/formatBuffer.hpp"
  61 #include "vmreg_x86.inline.hpp"
  62 #ifdef COMPILER1
  63 #include "c1/c1_Runtime1.hpp"
  64 #endif
  65 #ifdef COMPILER2
  66 #include "opto/runtime.hpp"
  67 #endif
  68 #if INCLUDE_JVMCI
  69 #include "jvmci/jvmciJavaClasses.hpp"
  70 #endif
  71 
  72 #define __ masm->
  73 
  74 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  75 
  76 class SimpleRuntimeFrame {
  77 
  78   public:
  79 
  80   // Most of the runtime stubs have this simple frame layout.
  81   // This class exists to make the layout shared in one place.
  82   // Offsets are for compiler stack slots, which are jints.
  83   enum layout {
  84     // The frame sender code expects that rbp will be in the "natural" place and
  85     // will override any oopMap setting for it. We must therefore force the layout
  86     // so that it agrees with the frame sender code.
  87     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  88     rbp_off2,
  89     return_off, return_off2,
  90     framesize
  91   };
  92 };
  93 
  94 class RegisterSaver {
  95   // Capture info about frame layout.  Layout offsets are in jint
  96   // units because compiler frame slots are jints.
  97 #define XSAVE_AREA_BEGIN 160
  98 #define XSAVE_AREA_YMM_BEGIN 576
  99 #define XSAVE_AREA_OPMASK_BEGIN 1088
 100 #define XSAVE_AREA_ZMM_BEGIN 1152
 101 #define XSAVE_AREA_UPPERBANK 1664
 102 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 103 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 104 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 105 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 106 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 107   enum layout {
 108     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 109     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 110     DEF_XMM_OFFS(0),
 111     DEF_XMM_OFFS(1),
 112     // 2..15 are implied in range usage
 113     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 114     DEF_YMM_OFFS(0),
 115     DEF_YMM_OFFS(1),
 116     // 2..15 are implied in range usage
 117     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 118     DEF_OPMASK_OFFS(0),
 119     DEF_OPMASK_OFFS(1),
 120     // 2..7 are implied in range usage
 121     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 122     DEF_ZMM_OFFS(0),
 123     DEF_ZMM_OFFS(1),
 124     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 125     DEF_ZMM_UPPER_OFFS(16),
 126     DEF_ZMM_UPPER_OFFS(17),
 127     // 18..31 are implied in range usage
 128     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 129     fpu_stateH_end,
 130     r15_off, r15H_off,
 131     r14_off, r14H_off,
 132     r13_off, r13H_off,
 133     r12_off, r12H_off,
 134     r11_off, r11H_off,
 135     r10_off, r10H_off,
 136     r9_off,  r9H_off,
 137     r8_off,  r8H_off,
 138     rdi_off, rdiH_off,
 139     rsi_off, rsiH_off,
 140     ignore_off, ignoreH_off,  // extra copy of rbp
 141     rsp_off, rspH_off,
 142     rbx_off, rbxH_off,
 143     rdx_off, rdxH_off,
 144     rcx_off, rcxH_off,
 145     rax_off, raxH_off,
 146     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 147     align_off, alignH_off,
 148     flags_off, flagsH_off,
 149     // The frame sender code expects that rbp will be in the "natural" place and
 150     // will override any oopMap setting for it. We must therefore force the layout
 151     // so that it agrees with the frame sender code.
 152     rbp_off, rbpH_off,        // copy of rbp we will restore
 153     return_off, returnH_off,  // slot for return address
 154     reg_save_size             // size in compiler stack slots
 155   };
 156 
 157  public:
 158   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 159   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 160 
 161   // Offsets into the register save area
 162   // Used by deoptimization when it is managing result register
 163   // values on its own
 164 
 165   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 166   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 167   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 168   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 169   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 170 
 171   // During deoptimization only the result registers need to be restored,
 172   // all the other values have already been extracted.
 173   static void restore_result_registers(MacroAssembler* masm);
 174 };
 175 
 176 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 177   int off = 0;
 178   int num_xmm_regs = XMMRegister::available_xmm_registers();
 179 #if COMPILER2_OR_JVMCI
 180   if (save_wide_vectors && UseAVX == 0) {
 181     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 182   }
 183   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 184 #else
 185   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 186 #endif
 187 
 188   // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
 189   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 190   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 191   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 192   // CodeBlob frame size is in words.
 193   int frame_size_in_words = frame_size_in_bytes / wordSize;
 194   *total_frame_words = frame_size_in_words;
 195 
 196   // Save registers, fpu state, and flags.
 197   // We assume caller has already pushed the return address onto the
 198   // stack, so rsp is 8-byte aligned here.
 199   // We push rpb twice in this sequence because we want the real rbp
 200   // to be under the return like a normal enter.
 201 
 202   __ enter();          // rsp becomes 16-byte aligned here
 203   __ push_CPU_state(); // Push a multiple of 16 bytes
 204 
 205   // push cpu state handles this on EVEX enabled targets
 206   if (save_wide_vectors) {
 207     // Save upper half of YMM registers(0..15)
 208     int base_addr = XSAVE_AREA_YMM_BEGIN;
 209     for (int n = 0; n < 16; n++) {
 210       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 211     }
 212     if (VM_Version::supports_evex()) {
 213       // Save upper half of ZMM registers(0..15)
 214       base_addr = XSAVE_AREA_ZMM_BEGIN;
 215       for (int n = 0; n < 16; n++) {
 216         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 217       }
 218       // Save full ZMM registers(16..num_xmm_regs)
 219       base_addr = XSAVE_AREA_UPPERBANK;
 220       off = 0;
 221       int vector_len = Assembler::AVX_512bit;
 222       for (int n = 16; n < num_xmm_regs; n++) {
 223         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 224       }
 225 #if COMPILER2_OR_JVMCI
 226       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 227       off = 0;
 228       for(int n = 0; n < KRegister::number_of_registers; n++) {
 229         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 230       }
 231 #endif
 232     }
 233   } else {
 234     if (VM_Version::supports_evex()) {
 235       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 236       int base_addr = XSAVE_AREA_UPPERBANK;
 237       off = 0;
 238       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 239       for (int n = 16; n < num_xmm_regs; n++) {
 240         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 241       }
 242 #if COMPILER2_OR_JVMCI
 243       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 244       off = 0;
 245       for(int n = 0; n < KRegister::number_of_registers; n++) {
 246         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 247       }
 248 #endif
 249     }
 250   }
 251   __ vzeroupper();
 252   if (frame::arg_reg_save_area_bytes != 0) {
 253     // Allocate argument register save area
 254     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 255   }
 256 
 257   // Set an oopmap for the call site.  This oopmap will map all
 258   // oop-registers and debug-info registers as callee-saved.  This
 259   // will allow deoptimization at this safepoint to find all possible
 260   // debug-info recordings, as well as let GC find all oops.
 261 
 262   OopMapSet *oop_maps = new OopMapSet();
 263   OopMap* map = new OopMap(frame_size_in_slots, 0);
 264 
 265 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 266 
 267   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 271   // rbp location is known implicitly by the frame sender code, needs no oopmap
 272   // and the location where rbp was saved by is ignored
 273   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 283   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 284   // on EVEX enabled targets, we get it included in the xsave area
 285   off = xmm0_off;
 286   int delta = xmm1_off - off;
 287   for (int n = 0; n < 16; n++) {
 288     XMMRegister xmm_name = as_XMMRegister(n);
 289     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 290     off += delta;
 291   }
 292   if (UseAVX > 2) {
 293     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 294     off = zmm16_off;
 295     delta = zmm17_off - off;
 296     for (int n = 16; n < num_xmm_regs; n++) {
 297       XMMRegister zmm_name = as_XMMRegister(n);
 298       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 299       off += delta;
 300     }
 301   }
 302 
 303 #if COMPILER2_OR_JVMCI
 304   if (save_wide_vectors) {
 305     // Save upper half of YMM registers(0..15)
 306     off = ymm0_off;
 307     delta = ymm1_off - ymm0_off;
 308     for (int n = 0; n < 16; n++) {
 309       XMMRegister ymm_name = as_XMMRegister(n);
 310       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 311       off += delta;
 312     }
 313     if (VM_Version::supports_evex()) {
 314       // Save upper half of ZMM registers(0..15)
 315       off = zmm0_off;
 316       delta = zmm1_off - zmm0_off;
 317       for (int n = 0; n < 16; n++) {
 318         XMMRegister zmm_name = as_XMMRegister(n);
 319         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 320         off += delta;
 321       }
 322     }
 323   }
 324 #endif // COMPILER2_OR_JVMCI
 325 
 326   // %%% These should all be a waste but we'll keep things as they were for now
 327   if (true) {
 328     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 332     // rbp location is known implicitly by the frame sender code, needs no oopmap
 333     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 343     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 344     // on EVEX enabled targets, we get it included in the xsave area
 345     off = xmm0H_off;
 346     delta = xmm1H_off - off;
 347     for (int n = 0; n < 16; n++) {
 348       XMMRegister xmm_name = as_XMMRegister(n);
 349       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 350       off += delta;
 351     }
 352     if (UseAVX > 2) {
 353       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 354       off = zmm16H_off;
 355       delta = zmm17H_off - off;
 356       for (int n = 16; n < num_xmm_regs; n++) {
 357         XMMRegister zmm_name = as_XMMRegister(n);
 358         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 359         off += delta;
 360       }
 361     }
 362   }
 363 
 364   return map;
 365 }
 366 
 367 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 368   int num_xmm_regs = XMMRegister::available_xmm_registers();
 369   if (frame::arg_reg_save_area_bytes != 0) {
 370     // Pop arg register save area
 371     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 372   }
 373 
 374 #if COMPILER2_OR_JVMCI
 375   if (restore_wide_vectors) {
 376     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 377     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 378   }
 379 #else
 380   assert(!restore_wide_vectors, "vectors are generated only by C2");
 381 #endif
 382 
 383   __ vzeroupper();
 384 
 385   // On EVEX enabled targets everything is handled in pop fpu state
 386   if (restore_wide_vectors) {
 387     // Restore upper half of YMM registers (0..15)
 388     int base_addr = XSAVE_AREA_YMM_BEGIN;
 389     for (int n = 0; n < 16; n++) {
 390       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 391     }
 392     if (VM_Version::supports_evex()) {
 393       // Restore upper half of ZMM registers (0..15)
 394       base_addr = XSAVE_AREA_ZMM_BEGIN;
 395       for (int n = 0; n < 16; n++) {
 396         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 397       }
 398       // Restore full ZMM registers(16..num_xmm_regs)
 399       base_addr = XSAVE_AREA_UPPERBANK;
 400       int vector_len = Assembler::AVX_512bit;
 401       int off = 0;
 402       for (int n = 16; n < num_xmm_regs; n++) {
 403         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 404       }
 405 #if COMPILER2_OR_JVMCI
 406       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 407       off = 0;
 408       for (int n = 0; n < KRegister::number_of_registers; n++) {
 409         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 410       }
 411 #endif
 412     }
 413   } else {
 414     if (VM_Version::supports_evex()) {
 415       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 416       int base_addr = XSAVE_AREA_UPPERBANK;
 417       int off = 0;
 418       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 419       for (int n = 16; n < num_xmm_regs; n++) {
 420         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 421       }
 422 #if COMPILER2_OR_JVMCI
 423       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 424       off = 0;
 425       for (int n = 0; n < KRegister::number_of_registers; n++) {
 426         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 427       }
 428 #endif
 429     }
 430   }
 431 
 432   // Recover CPU state
 433   __ pop_CPU_state();
 434   // Get the rbp described implicitly by the calling convention (no oopMap)
 435   __ pop(rbp);
 436 }
 437 
 438 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 439 
 440   // Just restore result register. Only used by deoptimization. By
 441   // now any callee save register that needs to be restored to a c2
 442   // caller of the deoptee has been extracted into the vframeArray
 443   // and will be stuffed into the c2i adapter we create for later
 444   // restoration so only result registers need to be restored here.
 445 
 446   // Restore fp result register
 447   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 448   // Restore integer result register
 449   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 450   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 451 
 452   // Pop all of the register save are off the stack except the return address
 453   __ addptr(rsp, return_offset_in_bytes());
 454 }
 455 
 456 // Is vector's size (in bytes) bigger than a size saved by default?
 457 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
 458 bool SharedRuntime::is_wide_vector(int size) {
 459   return size > 16;
 460 }
 461 
 462 // ---------------------------------------------------------------------------
 463 // Read the array of BasicTypes from a signature, and compute where the
 464 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 465 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 466 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 467 // as framesizes are fixed.
 468 // VMRegImpl::stack0 refers to the first slot 0(sp).
 469 // and VMRegImpl::stack0+1 refers to the memory word 4-byes higher.
 470 // Register up to Register::number_of_registers are the 64-bit
 471 // integer registers.
 472 
 473 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 474 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 475 // units regardless of build. Of course for i486 there is no 64 bit build
 476 
 477 // The Java calling convention is a "shifted" version of the C ABI.
 478 // By skipping the first C ABI register we can call non-static jni methods
 479 // with small numbers of arguments without having to shuffle the arguments
 480 // at all. Since we control the java ABI we ought to at least get some
 481 // advantage out of it.
 482 
 483 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 484                                            VMRegPair *regs,
 485                                            int total_args_passed) {
 486 
 487   // Create the mapping between argument positions and
 488   // registers.
 489   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 490     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 491   };
 492   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 493     j_farg0, j_farg1, j_farg2, j_farg3,
 494     j_farg4, j_farg5, j_farg6, j_farg7
 495   };
 496 
 497 
 498   uint int_args = 0;
 499   uint fp_args = 0;
 500   uint stk_args = 0; // inc by 2 each time
 501 
 502   for (int i = 0; i < total_args_passed; i++) {
 503     switch (sig_bt[i]) {
 504     case T_BOOLEAN:
 505     case T_CHAR:
 506     case T_BYTE:
 507     case T_SHORT:
 508     case T_INT:
 509       if (int_args < Argument::n_int_register_parameters_j) {
 510         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 511       } else {
 512         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 513         stk_args += 2;
 514       }
 515       break;
 516     case T_VOID:
 517       // halves of T_LONG or T_DOUBLE
 518       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 519       regs[i].set_bad();
 520       break;
 521     case T_LONG:
 522       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 523       // fall through
 524     case T_OBJECT:
 525     case T_ARRAY:
 526     case T_ADDRESS:
 527       if (int_args < Argument::n_int_register_parameters_j) {
 528         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 529       } else {
 530         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 531         stk_args += 2;
 532       }
 533       break;
 534     case T_FLOAT:
 535       if (fp_args < Argument::n_float_register_parameters_j) {
 536         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 537       } else {
 538         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 539         stk_args += 2;
 540       }
 541       break;
 542     case T_DOUBLE:
 543       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 544       if (fp_args < Argument::n_float_register_parameters_j) {
 545         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 546       } else {
 547         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 548         stk_args += 2;
 549       }
 550       break;
 551     default:
 552       ShouldNotReachHere();
 553       break;
 554     }
 555   }
 556 
 557   return align_up(stk_args, 2);
 558 }
 559 
 560 // Patch the callers callsite with entry to compiled code if it exists.
 561 static void patch_callers_callsite(MacroAssembler *masm) {
 562   Label L;
 563   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 564   __ jcc(Assembler::equal, L);
 565 
 566   // Save the current stack pointer
 567   __ mov(r13, rsp);
 568   // Schedule the branch target address early.
 569   // Call into the VM to patch the caller, then jump to compiled callee
 570   // rax isn't live so capture return address while we easily can
 571   __ movptr(rax, Address(rsp, 0));
 572 
 573   // align stack so push_CPU_state doesn't fault
 574   __ andptr(rsp, -(StackAlignmentInBytes));
 575   __ push_CPU_state();
 576   __ vzeroupper();
 577   // VM needs caller's callsite
 578   // VM needs target method
 579   // This needs to be a long call since we will relocate this adapter to
 580   // the codeBuffer and it may not reach
 581 
 582   // Allocate argument register save area
 583   if (frame::arg_reg_save_area_bytes != 0) {
 584     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 585   }
 586   __ mov(c_rarg0, rbx);
 587   __ mov(c_rarg1, rax);
 588   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 589 
 590   // De-allocate argument register save area
 591   if (frame::arg_reg_save_area_bytes != 0) {
 592     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 593   }
 594 
 595   __ vzeroupper();
 596   __ pop_CPU_state();
 597   // restore sp
 598   __ mov(rsp, r13);
 599   __ bind(L);
 600 }
 601 
 602 
 603 static void gen_c2i_adapter(MacroAssembler *masm,
 604                             int total_args_passed,
 605                             int comp_args_on_stack,
 606                             const BasicType *sig_bt,
 607                             const VMRegPair *regs,
 608                             Label& skip_fixup) {
 609   // Before we get into the guts of the C2I adapter, see if we should be here
 610   // at all.  We've come from compiled code and are attempting to jump to the
 611   // interpreter, which means the caller made a static call to get here
 612   // (vcalls always get a compiled target if there is one).  Check for a
 613   // compiled target.  If there is one, we need to patch the caller's call.
 614   patch_callers_callsite(masm);
 615 
 616   __ bind(skip_fixup);
 617 
 618   // Since all args are passed on the stack, total_args_passed *
 619   // Interpreter::stackElementSize is the space we need.
 620 
 621   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 622 
 623   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 624 
 625   // stack is aligned, keep it that way
 626   // This is not currently needed or enforced by the interpreter, but
 627   // we might as well conform to the ABI.
 628   extraspace = align_up(extraspace, 2*wordSize);
 629 
 630   // set senderSP value
 631   __ lea(r13, Address(rsp, wordSize));
 632 
 633 #ifdef ASSERT
 634   __ check_stack_alignment(r13, "sender stack not aligned");
 635 #endif
 636   if (extraspace > 0) {
 637     // Pop the return address
 638     __ pop(rax);
 639 
 640     __ subptr(rsp, extraspace);
 641 
 642     // Push the return address
 643     __ push(rax);
 644 
 645     // Account for the return address location since we store it first rather
 646     // than hold it in a register across all the shuffling
 647     extraspace += wordSize;
 648   }
 649 
 650 #ifdef ASSERT
 651   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 652 #endif
 653 
 654   // Now write the args into the outgoing interpreter space
 655   for (int i = 0; i < total_args_passed; i++) {
 656     if (sig_bt[i] == T_VOID) {
 657       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 658       continue;
 659     }
 660 
 661     // offset to start parameters
 662     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 663     int next_off = st_off - Interpreter::stackElementSize;
 664 
 665     // Say 4 args:
 666     // i   st_off
 667     // 0   32 T_LONG
 668     // 1   24 T_VOID
 669     // 2   16 T_OBJECT
 670     // 3    8 T_BOOL
 671     // -    0 return address
 672     //
 673     // However to make thing extra confusing. Because we can fit a long/double in
 674     // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter
 675     // leaves one slot empty and only stores to a single slot. In this case the
 676     // slot that is occupied is the T_VOID slot. See I said it was confusing.
 677 
 678     VMReg r_1 = regs[i].first();
 679     VMReg r_2 = regs[i].second();
 680     if (!r_1->is_valid()) {
 681       assert(!r_2->is_valid(), "");
 682       continue;
 683     }
 684     if (r_1->is_stack()) {
 685       // memory to memory use rax
 686       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 687       if (!r_2->is_valid()) {
 688         // sign extend??
 689         __ movl(rax, Address(rsp, ld_off));
 690         __ movptr(Address(rsp, st_off), rax);
 691 
 692       } else {
 693 
 694         __ movq(rax, Address(rsp, ld_off));
 695 
 696         // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 697         // T_DOUBLE and T_LONG use two slots in the interpreter
 698         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 699           // ld_off == LSW, ld_off+wordSize == MSW
 700           // st_off == MSW, next_off == LSW
 701           __ movq(Address(rsp, next_off), rax);
 702 #ifdef ASSERT
 703           // Overwrite the unused slot with known junk
 704           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 705           __ movptr(Address(rsp, st_off), rax);
 706 #endif /* ASSERT */
 707         } else {
 708           __ movq(Address(rsp, st_off), rax);
 709         }
 710       }
 711     } else if (r_1->is_Register()) {
 712       Register r = r_1->as_Register();
 713       if (!r_2->is_valid()) {
 714         // must be only an int (or less ) so move only 32bits to slot
 715         // why not sign extend??
 716         __ movl(Address(rsp, st_off), r);
 717       } else {
 718         // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 719         // T_DOUBLE and T_LONG use two slots in the interpreter
 720         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 721           // long/double in gpr
 722 #ifdef ASSERT
 723           // Overwrite the unused slot with known junk
 724           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 725           __ movptr(Address(rsp, st_off), rax);
 726 #endif /* ASSERT */
 727           __ movq(Address(rsp, next_off), r);
 728         } else {
 729           __ movptr(Address(rsp, st_off), r);
 730         }
 731       }
 732     } else {
 733       assert(r_1->is_XMMRegister(), "");
 734       if (!r_2->is_valid()) {
 735         // only a float use just part of the slot
 736         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 737       } else {
 738 #ifdef ASSERT
 739         // Overwrite the unused slot with known junk
 740         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 741         __ movptr(Address(rsp, st_off), rax);
 742 #endif /* ASSERT */
 743         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 744       }
 745     }
 746   }
 747 
 748   // Schedule the branch target address early.
 749   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 750   __ jmp(rcx);
 751 }
 752 
 753 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 754                         address code_start, address code_end,
 755                         Label& L_ok) {
 756   Label L_fail;
 757   __ lea(temp_reg, ExternalAddress(code_start));
 758   __ cmpptr(pc_reg, temp_reg);
 759   __ jcc(Assembler::belowEqual, L_fail);
 760   __ lea(temp_reg, ExternalAddress(code_end));
 761   __ cmpptr(pc_reg, temp_reg);
 762   __ jcc(Assembler::below, L_ok);
 763   __ bind(L_fail);
 764 }
 765 
 766 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 767                                     int total_args_passed,
 768                                     int comp_args_on_stack,
 769                                     const BasicType *sig_bt,
 770                                     const VMRegPair *regs) {
 771 
 772   // Note: r13 contains the senderSP on entry. We must preserve it since
 773   // we may do a i2c -> c2i transition if we lose a race where compiled
 774   // code goes non-entrant while we get args ready.
 775   // In addition we use r13 to locate all the interpreter args as
 776   // we must align the stack to 16 bytes on an i2c entry else we
 777   // lose alignment we expect in all compiled code and register
 778   // save code can segv when fxsave instructions find improperly
 779   // aligned stack pointer.
 780 
 781   // Adapters can be frameless because they do not require the caller
 782   // to perform additional cleanup work, such as correcting the stack pointer.
 783   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 784   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 785   // even if a callee has modified the stack pointer.
 786   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 787   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 788   // up via the senderSP register).
 789   // In other words, if *either* the caller or callee is interpreted, we can
 790   // get the stack pointer repaired after a call.
 791   // This is why c2i and i2c adapters cannot be indefinitely composed.
 792   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 793   // both caller and callee would be compiled methods, and neither would
 794   // clean up the stack pointer changes performed by the two adapters.
 795   // If this happens, control eventually transfers back to the compiled
 796   // caller, but with an uncorrected stack, causing delayed havoc.
 797 
 798   if (VerifyAdapterCalls &&
 799       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
 800     // So, let's test for cascading c2i/i2c adapters right now.
 801     //  assert(Interpreter::contains($return_addr) ||
 802     //         StubRoutines::contains($return_addr),
 803     //         "i2c adapter must return to an interpreter frame");
 804     __ block_comment("verify_i2c { ");
 805     // Pick up the return address
 806     __ movptr(rax, Address(rsp, 0));
 807     Label L_ok;
 808     if (Interpreter::code() != NULL)
 809       range_check(masm, rax, r11,
 810                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
 811                   L_ok);
 812     if (StubRoutines::code1() != NULL)
 813       range_check(masm, rax, r11,
 814                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
 815                   L_ok);
 816     if (StubRoutines::code2() != NULL)
 817       range_check(masm, rax, r11,
 818                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
 819                   L_ok);
 820     const char* msg = "i2c adapter must return to an interpreter frame";
 821     __ block_comment(msg);
 822     __ stop(msg);
 823     __ bind(L_ok);
 824     __ block_comment("} verify_i2ce ");
 825   }
 826 
 827   // Must preserve original SP for loading incoming arguments because
 828   // we need to align the outgoing SP for compiled code.
 829   __ movptr(r11, rsp);
 830 
 831   // Pick up the return address
 832   __ pop(rax);
 833 
 834   // Convert 4-byte c2 stack slots to words.
 835   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 836 
 837   if (comp_args_on_stack) {
 838     __ subptr(rsp, comp_words_on_stack * wordSize);
 839   }
 840 
 841   // Ensure compiled code always sees stack at proper alignment
 842   __ andptr(rsp, -16);
 843 
 844   // push the return address and misalign the stack that youngest frame always sees
 845   // as far as the placement of the call instruction
 846   __ push(rax);
 847 
 848   // Put saved SP in another register
 849   const Register saved_sp = rax;
 850   __ movptr(saved_sp, r11);
 851 
 852   // Will jump to the compiled code just as if compiled code was doing it.
 853   // Pre-load the register-jump target early, to schedule it better.
 854   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 855 
 856 #if INCLUDE_JVMCI
 857   if (EnableJVMCI) {
 858     // check if this call should be routed towards a specific entry point
 859     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 860     Label no_alternative_target;
 861     __ jcc(Assembler::equal, no_alternative_target);
 862     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 863     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 864     __ bind(no_alternative_target);
 865   }
 866 #endif // INCLUDE_JVMCI
 867 
 868   // Now generate the shuffle code.  Pick up all register args and move the
 869   // rest through the floating point stack top.
 870   for (int i = 0; i < total_args_passed; i++) {
 871     if (sig_bt[i] == T_VOID) {
 872       // Longs and doubles are passed in native word order, but misaligned
 873       // in the 32-bit build.
 874       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 875       continue;
 876     }
 877 
 878     // Pick up 0, 1 or 2 words from SP+offset.
 879 
 880     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 881             "scrambled load targets?");
 882     // Load in argument order going down.
 883     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 884     // Point to interpreter value (vs. tag)
 885     int next_off = ld_off - Interpreter::stackElementSize;
 886     //
 887     //
 888     //
 889     VMReg r_1 = regs[i].first();
 890     VMReg r_2 = regs[i].second();
 891     if (!r_1->is_valid()) {
 892       assert(!r_2->is_valid(), "");
 893       continue;
 894     }
 895     if (r_1->is_stack()) {
 896       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 897       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 898 
 899       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 900       // and if we end up going thru a c2i because of a miss a reasonable value of r13
 901       // will be generated.
 902       if (!r_2->is_valid()) {
 903         // sign extend???
 904         __ movl(r13, Address(saved_sp, ld_off));
 905         __ movptr(Address(rsp, st_off), r13);
 906       } else {
 907         //
 908         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 909         // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
 910         // So we must adjust where to pick up the data to match the interpreter.
 911         //
 912         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
 913         // are accessed as negative so LSW is at LOW address
 914 
 915         // ld_off is MSW so get LSW
 916         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 917                            next_off : ld_off;
 918         __ movq(r13, Address(saved_sp, offset));
 919         // st_off is LSW (i.e. reg.first())
 920         __ movq(Address(rsp, st_off), r13);
 921       }
 922     } else if (r_1->is_Register()) {  // Register argument
 923       Register r = r_1->as_Register();
 924       assert(r != rax, "must be different");
 925       if (r_2->is_valid()) {
 926         //
 927         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 928         // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
 929         // So we must adjust where to pick up the data to match the interpreter.
 930 
 931         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 932                            next_off : ld_off;
 933 
 934         // this can be a misaligned move
 935         __ movq(r, Address(saved_sp, offset));
 936       } else {
 937         // sign extend and use a full word?
 938         __ movl(r, Address(saved_sp, ld_off));
 939       }
 940     } else {
 941       if (!r_2->is_valid()) {
 942         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 943       } else {
 944         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 945       }
 946     }
 947   }
 948 
 949   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 950 
 951   // 6243940 We might end up in handle_wrong_method if
 952   // the callee is deoptimized as we race thru here. If that
 953   // happens we don't want to take a safepoint because the
 954   // caller frame will look interpreted and arguments are now
 955   // "compiled" so it is much better to make this transition
 956   // invisible to the stack walking code. Unfortunately if
 957   // we try and find the callee by normal means a safepoint
 958   // is possible. So we stash the desired callee in the thread
 959   // and the vm will find there should this case occur.
 960 
 961   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 962 
 963   // put Method* where a c2i would expect should we end up there
 964   // only needed because eof c2 resolve stubs return Method* as a result in
 965   // rax
 966   __ mov(rax, rbx);
 967   __ jmp(r11);
 968 }
 969 
 970 // ---------------------------------------------------------------
 971 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 972                                                             int total_args_passed,
 973                                                             int comp_args_on_stack,
 974                                                             const BasicType *sig_bt,
 975                                                             const VMRegPair *regs,
 976                                                             AdapterFingerPrint* fingerprint) {
 977   address i2c_entry = __ pc();
 978 
 979   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 980 
 981   // -------------------------------------------------------------------------
 982   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 983   // to the interpreter.  The args start out packed in the compiled layout.  They
 984   // need to be unpacked into the interpreter layout.  This will almost always
 985   // require some stack space.  We grow the current (compiled) stack, then repack
 986   // the args.  We  finally end in a jump to the generic interpreter entry point.
 987   // On exit from the interpreter, the interpreter will restore our SP (lest the
 988   // compiled code, which relies solely on SP and not RBP, get sick).
 989 
 990   address c2i_unverified_entry = __ pc();
 991   Label skip_fixup;
 992   Label ok;
 993 
 994   Register holder = rax;
 995   Register receiver = j_rarg0;
 996   Register temp = rbx;
 997 
 998   {
 999     __ load_klass(temp, receiver, rscratch1);
1000     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
1001     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
1002     __ jcc(Assembler::equal, ok);
1003     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1004 
1005     __ bind(ok);
1006     // Method might have been compiled since the call site was patched to
1007     // interpreted if that is the case treat it as a miss so we can get
1008     // the call site corrected.
1009     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1010     __ jcc(Assembler::equal, skip_fixup);
1011     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1012   }
1013 
1014   address c2i_entry = __ pc();
1015 
1016   // Class initialization barrier for static methods
1017   address c2i_no_clinit_check_entry = NULL;
1018   if (VM_Version::supports_fast_class_init_checks()) {
1019     Label L_skip_barrier;
1020     Register method = rbx;
1021 
1022     { // Bypass the barrier for non-static methods
1023       Register flags = rscratch1;
1024       __ movl(flags, Address(method, Method::access_flags_offset()));
1025       __ testl(flags, JVM_ACC_STATIC);
1026       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1027     }
1028 
1029     Register klass = rscratch1;
1030     __ load_method_holder(klass, method);
1031     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1032 
1033     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1034 
1035     __ bind(L_skip_barrier);
1036     c2i_no_clinit_check_entry = __ pc();
1037   }
1038 
1039   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1040   bs->c2i_entry_barrier(masm);
1041 
1042   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1043 
1044   __ flush();
1045   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1046 }
1047 
1048 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1049                                          VMRegPair *regs,
1050                                          VMRegPair *regs2,
1051                                          int total_args_passed) {
1052   assert(regs2 == NULL, "not needed on x86");
1053 // We return the amount of VMRegImpl stack slots we need to reserve for all
1054 // the arguments NOT counting out_preserve_stack_slots.
1055 
1056 // NOTE: These arrays will have to change when c1 is ported
1057 #ifdef _WIN64
1058     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1059       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1060     };
1061     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1062       c_farg0, c_farg1, c_farg2, c_farg3
1063     };
1064 #else
1065     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1066       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1067     };
1068     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1069       c_farg0, c_farg1, c_farg2, c_farg3,
1070       c_farg4, c_farg5, c_farg6, c_farg7
1071     };
1072 #endif // _WIN64
1073 
1074 
1075     uint int_args = 0;
1076     uint fp_args = 0;
1077     uint stk_args = 0; // inc by 2 each time
1078 
1079     for (int i = 0; i < total_args_passed; i++) {
1080       switch (sig_bt[i]) {
1081       case T_BOOLEAN:
1082       case T_CHAR:
1083       case T_BYTE:
1084       case T_SHORT:
1085       case T_INT:
1086         if (int_args < Argument::n_int_register_parameters_c) {
1087           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1088 #ifdef _WIN64
1089           fp_args++;
1090           // Allocate slots for callee to stuff register args the stack.
1091           stk_args += 2;
1092 #endif
1093         } else {
1094           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1095           stk_args += 2;
1096         }
1097         break;
1098       case T_LONG:
1099         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1100         // fall through
1101       case T_OBJECT:
1102       case T_ARRAY:
1103       case T_ADDRESS:
1104       case T_METADATA:
1105         if (int_args < Argument::n_int_register_parameters_c) {
1106           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1107 #ifdef _WIN64
1108           fp_args++;
1109           stk_args += 2;
1110 #endif
1111         } else {
1112           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1113           stk_args += 2;
1114         }
1115         break;
1116       case T_FLOAT:
1117         if (fp_args < Argument::n_float_register_parameters_c) {
1118           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1119 #ifdef _WIN64
1120           int_args++;
1121           // Allocate slots for callee to stuff register args the stack.
1122           stk_args += 2;
1123 #endif
1124         } else {
1125           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1126           stk_args += 2;
1127         }
1128         break;
1129       case T_DOUBLE:
1130         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1131         if (fp_args < Argument::n_float_register_parameters_c) {
1132           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1133 #ifdef _WIN64
1134           int_args++;
1135           // Allocate slots for callee to stuff register args the stack.
1136           stk_args += 2;
1137 #endif
1138         } else {
1139           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1140           stk_args += 2;
1141         }
1142         break;
1143       case T_VOID: // Halves of longs and doubles
1144         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1145         regs[i].set_bad();
1146         break;
1147       default:
1148         ShouldNotReachHere();
1149         break;
1150       }
1151     }
1152 #ifdef _WIN64
1153   // windows abi requires that we always allocate enough stack space
1154   // for 4 64bit registers to be stored down.
1155   if (stk_args < 8) {
1156     stk_args = 8;
1157   }
1158 #endif // _WIN64
1159 
1160   return stk_args;
1161 }
1162 
1163 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1164                                              uint num_bits,
1165                                              uint total_args_passed) {
1166   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1167          "only certain vector sizes are supported for now");
1168 
1169   static const XMMRegister VEC_ArgReg[32] = {
1170      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1171      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1172     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1173     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1174   };
1175 
1176   uint stk_args = 0;
1177   uint fp_args = 0;
1178 
1179   for (uint i = 0; i < total_args_passed; i++) {
1180     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1181     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1182     regs[i].set_pair(vmreg->next(next_val), vmreg);
1183   }
1184 
1185   return stk_args;
1186 }
1187 
1188 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1189   // We always ignore the frame_slots arg and just use the space just below frame pointer
1190   // which by this time is free to use
1191   switch (ret_type) {
1192   case T_FLOAT:
1193     __ movflt(Address(rbp, -wordSize), xmm0);
1194     break;
1195   case T_DOUBLE:
1196     __ movdbl(Address(rbp, -wordSize), xmm0);
1197     break;
1198   case T_VOID:  break;
1199   default: {
1200     __ movptr(Address(rbp, -wordSize), rax);
1201     }
1202   }
1203 }
1204 
1205 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1206   // We always ignore the frame_slots arg and just use the space just below frame pointer
1207   // which by this time is free to use
1208   switch (ret_type) {
1209   case T_FLOAT:
1210     __ movflt(xmm0, Address(rbp, -wordSize));
1211     break;
1212   case T_DOUBLE:
1213     __ movdbl(xmm0, Address(rbp, -wordSize));
1214     break;
1215   case T_VOID:  break;
1216   default: {
1217     __ movptr(rax, Address(rbp, -wordSize));
1218     }
1219   }
1220 }
1221 
1222 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1223     for ( int i = first_arg ; i < arg_count ; i++ ) {
1224       if (args[i].first()->is_Register()) {
1225         __ push(args[i].first()->as_Register());
1226       } else if (args[i].first()->is_XMMRegister()) {
1227         __ subptr(rsp, 2*wordSize);
1228         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1229       }
1230     }
1231 }
1232 
1233 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1234     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1235       if (args[i].first()->is_Register()) {
1236         __ pop(args[i].first()->as_Register());
1237       } else if (args[i].first()->is_XMMRegister()) {
1238         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1239         __ addptr(rsp, 2*wordSize);
1240       }
1241     }
1242 }
1243 
1244 static void verify_oop_args(MacroAssembler* masm,
1245                             const methodHandle& method,
1246                             const BasicType* sig_bt,
1247                             const VMRegPair* regs) {
1248   Register temp_reg = rbx;  // not part of any compiled calling seq
1249   if (VerifyOops) {
1250     for (int i = 0; i < method->size_of_parameters(); i++) {
1251       if (is_reference_type(sig_bt[i])) {
1252         VMReg r = regs[i].first();
1253         assert(r->is_valid(), "bad oop arg");
1254         if (r->is_stack()) {
1255           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1256           __ verify_oop(temp_reg);
1257         } else {
1258           __ verify_oop(r->as_Register());
1259         }
1260       }
1261     }
1262   }
1263 }
1264 
1265 static void check_continuation_enter_argument(VMReg actual_vmreg,
1266                                               Register expected_reg,
1267                                               const char* name) {
1268   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1269   assert(actual_vmreg->as_Register() == expected_reg,
1270          "%s is in unexpected register: %s instead of %s",
1271          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1272 }
1273 
1274 
1275 //---------------------------- continuation_enter_setup ---------------------------
1276 //
1277 // Arguments:
1278 //   None.
1279 //
1280 // Results:
1281 //   rsp: pointer to blank ContinuationEntry
1282 //
1283 // Kills:
1284 //   rax
1285 //
1286 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1287   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1288   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1289   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1290 
1291   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1292   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1293 
1294   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1295   OopMap* map = new OopMap(frame_size, 0);
1296 
1297   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1298   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1299   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1300 
1301   return map;
1302 }
1303 
1304 //---------------------------- fill_continuation_entry ---------------------------
1305 //
1306 // Arguments:
1307 //   rsp: pointer to a blank ContinuationEntry
1308 //   reg_cont_obj: pointer to the continuation
1309 //   reg_flags: flags
1310 //
1311 // Results:
1312 //   rsp: pointer to filled out ContinuationEntry
1313 //
1314 // Kills:
1315 //   rax
1316 //
1317 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1318   assert_different_registers(rax, reg_cont_obj, reg_flags);
1319 #ifdef ASSERT
1320   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1321 #endif
1322   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1323   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1324   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1325   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1326   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1327 
1328   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1329   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1330   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1331   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1332 
1333   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1334   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1335 }
1336 
1337 //---------------------------- continuation_enter_cleanup ---------------------------
1338 //
1339 // Arguments:
1340 //   rsp: pointer to the ContinuationEntry
1341 //
1342 // Results:
1343 //   rsp: pointer to the spilled rbp in the entry frame
1344 //
1345 // Kills:
1346 //   rbx
1347 //
1348 static void continuation_enter_cleanup(MacroAssembler* masm) {
1349 #ifdef ASSERT
1350   Label L_good_sp;
1351   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1352   __ jcc(Assembler::equal, L_good_sp);
1353   __ stop("Incorrect rsp at continuation_enter_cleanup");
1354   __ bind(L_good_sp);
1355 #endif
1356 
1357   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1358   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1359   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1360   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1361 
1362   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1363   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1364   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1365 }
1366 
1367 static void gen_continuation_enter(MacroAssembler* masm,
1368                                    const VMRegPair* regs,
1369                                    int& exception_offset,
1370                                    OopMapSet* oop_maps,
1371                                    int& frame_complete,
1372                                    int& stack_slots,
1373                                    int& interpreted_entry_offset,
1374                                    int& compiled_entry_offset) {
1375 
1376   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1377   int pos_cont_obj   = 0;
1378   int pos_is_cont    = 1;
1379   int pos_is_virtual = 2;
1380 
1381   // The platform-specific calling convention may present the arguments in various registers.
1382   // To simplify the rest of the code, we expect the arguments to reside in these known
1383   // registers, and we additionally check the placement here in case the calling convention
1384   // ever changes.
1385   Register reg_cont_obj   = c_rarg1;
1386   Register reg_is_cont    = c_rarg2;
1387   Register reg_is_virtual = c_rarg3;
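     // For reference (illustrative; the actual mappings come from the platform register
     // definitions): on Linux/System V c_rarg1/2/3 are rsi/rdx/rcx, while on Windows they
     // are rdx/r8/r9. The checks below catch any drift if that mapping or the Java calling
     // convention ever changes.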
1388 
1389   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1390   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1391   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1392 
1393   // Utility methods kill rax, make sure there are no collisions
1394   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1395 
1396   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1397                          relocInfo::static_call_type);
1398 
1399   address start = __ pc();
1400 
1401   Label L_thaw, L_exit;
1402 
1403   // i2i entry, used only in interp_only_mode
1404   interpreted_entry_offset = __ pc() - start;
1405   {
1406 #ifdef ASSERT
1407     Label is_interp_only;
1408     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1409     __ jcc(Assembler::notEqual, is_interp_only);
1410     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1411     __ bind(is_interp_only);
1412 #endif
1413 
1414     __ pop(rax); // return address
1415     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1416     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1417     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1418     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
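       // The interpreter keeps the arguments on its expression stack with the last declared
       // argument closest to rsp, so with the return address popped above, slot 0 holds
       // isVirtualThread, slot 1 holds isContinue, and slot 2 holds the Continuation oop.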
1419     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1420     __ push(rax); // return address
1421     __ push_cont_fastpath();
1422 
1423     __ enter();
1424 
1425     stack_slots = 2; // will be adjusted in setup
1426     OopMap* map = continuation_enter_setup(masm, stack_slots);
1427     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1428     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1429 
1430     __ verify_oop(reg_cont_obj);
1431 
1432     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1433 
1434     // If isContinue, call to thaw. Otherwise, resolve the call and exit.
1435     __ testptr(reg_is_cont, reg_is_cont);
1436     __ jcc(Assembler::notZero, L_thaw);
1437 
1438     // --- Resolve path
1439 
1440     // Make sure the call is patchable
1441     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1442     // Emit stub for static call
1443     CodeBuffer* cbuf = masm->code_section()->outer();
1444     address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1445     if (stub == nullptr) {
1446       fatal("CodeCache is full at gen_continuation_enter");
1447     }
1448     __ call(resolve);
1449     oop_maps->add_gc_map(__ pc() - start, map);
1450     __ post_call_nop();
1451 
1452     __ jmp(L_exit);
1453   }
1454 
1455   // compiled entry
1456   __ align(CodeEntryAlignment);
1457   compiled_entry_offset = __ pc() - start;
1458   __ enter();
1459 
1460   stack_slots = 2; // will be adjusted in setup
1461   OopMap* map = continuation_enter_setup(masm, stack_slots);
1462 
1463   // Frame is now completed as far as size and linkage.
1464   frame_complete = __ pc() - start;
1465 
1466   __ verify_oop(reg_cont_obj);
1467 
1468   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1469 
1470   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1471   __ testptr(reg_is_cont, reg_is_cont);
1472   __ jccb(Assembler::notZero, L_thaw);
1473 
1474   // --- call Continuation.enter(Continuation c, boolean isContinue)
1475 
1476   // Make sure the call is patchable
1477   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
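     // The alignment keeps the 32-bit call displacement within a single naturally aligned
     // machine word, so the resolver can later patch the call target atomically while other
     // threads may be executing this code.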
1478 
1479   // Emit stub for static call
1480   CodeBuffer* cbuf = masm->code_section()->outer();
1481   address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1482   if (stub == nullptr) {
1483     fatal("CodeCache is full at gen_continuation_enter");
1484   }
1485 
1486   // The call needs to be resolved. There's a special case for this in
1487   // SharedRuntime::find_callee_info_helper() which calls
1488   // LinkResolver::resolve_continuation_enter() which resolves the call to
1489   // Continuation.enter(Continuation c, boolean isContinue).
1490   __ call(resolve);
1491 
1492   oop_maps->add_gc_map(__ pc() - start, map);
1493   __ post_call_nop();
1494 
1495   __ jmpb(L_exit);
1496 
1497   // --- Thawing path
1498 
1499   __ bind(L_thaw);
1500 
1501   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1502 
1503   ContinuationEntry::_return_pc_offset = __ pc() - start;
1504   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1505   __ post_call_nop();
1506 
1507   // --- Normal exit (resolve/thawing)
1508 
1509   __ bind(L_exit);
1510 
1511   continuation_enter_cleanup(masm);
1512   __ pop(rbp);
1513   __ ret(0);
1514 
1515   // --- Exception handling path
1516 
1517   exception_offset = __ pc() - start;
1518 
1519   continuation_enter_cleanup(masm);
1520   __ pop(rbp);
1521 
1522   __ movptr(c_rarg0, r15_thread);
1523   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1524 
1525   // rax still holds the original exception oop, save it before the call
1526   __ push(rax);
1527 
1528   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1529   __ movptr(rbx, rax);
1530 
1531   // Continue at exception handler:
1532   //   rax: exception oop
1533   //   rbx: exception handler
1534   //   rdx: exception pc
1535   __ pop(rax);
1536   __ verify_oop(rax);
1537   __ pop(rdx);
1538   __ jmp(rbx);
1539 }
1540 
1541 static void gen_continuation_yield(MacroAssembler* masm,
1542                                    const VMRegPair* regs,
1543                                    OopMapSet* oop_maps,
1544                                    int& frame_complete,
1545                                    int& stack_slots,
1546                                    int& compiled_entry_offset) {
1547   enum layout {
1548     rbp_off,
1549     rbpH_off,
1550     return_off,
1551     return_off2,
1552     framesize // inclusive of return address
1553   };
1554   stack_slots = framesize / VMRegImpl::slots_per_word;
1555   assert(stack_slots == 2, "recheck layout");
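     // Stack slots are 32-bit, so the four slots enumerated above describe two 64-bit words
     // (the saved rbp and the return address), which is why stack_slots must come out as 2.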
1556 
1557   address start = __ pc();
1558   compiled_entry_offset = __ pc() - start;
1559   __ enter();
1560   address the_pc = __ pc();
1561 
1562   frame_complete = the_pc - start;
1563 
1564   // This nop must be exactly at the PC we push into the frame info.
1565   // We use this nop for fast CodeBlob lookup, so associate the OopMap
1566   // with it right away.
1567   __ post_call_nop();
1568   OopMap* map = new OopMap(framesize, 1);
1569   oop_maps->add_gc_map(frame_complete, map);
1570 
1571   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1572   __ movptr(c_rarg0, r15_thread);
1573   __ movptr(c_rarg1, rsp);
1574   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1575   __ reset_last_Java_frame(true);
1576 
1577   Label L_pinned;
1578 
1579   __ testptr(rax, rax);
1580   __ jcc(Assembler::notZero, L_pinned);
1581 
1582   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1583   continuation_enter_cleanup(masm);
1584   __ pop(rbp);
1585   __ ret(0);
1586 
1587   __ bind(L_pinned);
1588 
1589   // Pinned, return to caller
1590 
1591   // handle pending exception thrown by freeze
1592   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1593   Label ok;
1594   __ jcc(Assembler::equal, ok);
1595   __ leave();
1596   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1597   __ bind(ok);
1598 
1599   __ leave();
1600   __ ret(0);
1601 }
1602 
1603 static void gen_special_dispatch(MacroAssembler* masm,
1604                                  const methodHandle& method,
1605                                  const BasicType* sig_bt,
1606                                  const VMRegPair* regs) {
1607   verify_oop_args(masm, method, sig_bt, regs);
1608   vmIntrinsics::ID iid = method->intrinsic_id();
1609 
1610   // Now write the args into the outgoing interpreter space
1611   bool     has_receiver   = false;
1612   Register receiver_reg   = noreg;
1613   int      member_arg_pos = -1;
1614   Register member_reg     = noreg;
1615   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1616   if (ref_kind != 0) {
1617     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1618     member_reg = rbx;  // known to be free at this point
1619     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1620   } else if (iid == vmIntrinsics::_invokeBasic) {
1621     has_receiver = true;
1622   } else if (iid == vmIntrinsics::_linkToNative) {
1623     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1624     member_reg = rbx;  // known to be free at this point
1625   } else {
1626     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1627   }
1628 
1629   if (member_reg != noreg) {
1630     // Load the member_arg into register, if necessary.
1631     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1632     VMReg r = regs[member_arg_pos].first();
1633     if (r->is_stack()) {
1634       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1635     } else {
1636       // no data motion is needed
1637       member_reg = r->as_Register();
1638     }
1639   }
1640 
1641   if (has_receiver) {
1642     // Make sure the receiver is loaded into a register.
1643     assert(method->size_of_parameters() > 0, "oob");
1644     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1645     VMReg r = regs[0].first();
1646     assert(r->is_valid(), "bad receiver arg");
1647     if (r->is_stack()) {
1648       // Porting note:  This assumes that compiled calling conventions always
1649       // pass the receiver oop in a register.  If this is not true on some
1650       // platform, pick a temp and load the receiver from stack.
1651       fatal("receiver always in a register");
1652       receiver_reg = j_rarg0;  // known to be free at this point
1653       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1654     } else {
1655       // no data motion is needed
1656       receiver_reg = r->as_Register();
1657     }
1658   }
1659 
1660   // Figure out which address we are really jumping to:
1661   MethodHandles::generate_method_handle_dispatch(masm, iid,
1662                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1663 }
1664 
1665 // ---------------------------------------------------------------------------
1666 // Generate a native wrapper for a given method.  The method takes arguments
1667 // in the Java compiled code convention, marshals them to the native
1668 // convention (handlizes oops, etc), transitions to native, makes the call,
1669 // returns to java state (possibly blocking), unhandlizes any result and
1670 // returns.
1671 //
1672 // Critical native functions are a shorthand for the use of
1673 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1674 // functions.  The wrapper is expected to unpack the arguments before
1675 // passing them to the callee. Critical native functions leave the state _in_Java,
1676 // since they cannot stop for GC.
1677 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1678 // block and the check for pending exceptions, since it's impossible for them
1679 // to be thrown.
1680 //
1681 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1682                                                 const methodHandle& method,
1683                                                 int compile_id,
1684                                                 BasicType* in_sig_bt,
1685                                                 VMRegPair* in_regs,
1686                                                 BasicType ret_type) {
1687   if (method->is_continuation_native_intrinsic()) {
1688     int exception_offset = -1;
1689     OopMapSet* oop_maps = new OopMapSet();
1690     int frame_complete = -1;
1691     int stack_slots = -1;
1692     int interpreted_entry_offset = -1;
1693     int vep_offset = -1;
1694     if (method->is_continuation_enter_intrinsic()) {
1695       gen_continuation_enter(masm,
1696                              in_regs,
1697                              exception_offset,
1698                              oop_maps,
1699                              frame_complete,
1700                              stack_slots,
1701                              interpreted_entry_offset,
1702                              vep_offset);
1703     } else if (method->is_continuation_yield_intrinsic()) {
1704       gen_continuation_yield(masm,
1705                              in_regs,
1706                              oop_maps,
1707                              frame_complete,
1708                              stack_slots,
1709                              vep_offset);
1710     } else {
1711       guarantee(false, "Unknown Continuation native intrinsic");
1712     }
1713 
1714 #ifdef ASSERT
1715     if (method->is_continuation_enter_intrinsic()) {
1716       assert(interpreted_entry_offset != -1, "Must be set");
1717       assert(exception_offset != -1,         "Must be set");
1718     } else {
1719       assert(interpreted_entry_offset == -1, "Must be unset");
1720       assert(exception_offset == -1,         "Must be unset");
1721     }
1722     assert(frame_complete != -1,    "Must be set");
1723     assert(stack_slots != -1,       "Must be set");
1724     assert(vep_offset != -1,        "Must be set");
1725 #endif
1726 
1727     __ flush();
1728     nmethod* nm = nmethod::new_native_nmethod(method,
1729                                               compile_id,
1730                                               masm->code(),
1731                                               vep_offset,
1732                                               frame_complete,
1733                                               stack_slots,
1734                                               in_ByteSize(-1),
1735                                               in_ByteSize(-1),
1736                                               oop_maps,
1737                                               exception_offset);
1738     if (method->is_continuation_enter_intrinsic()) {
1739       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1740     } else if (method->is_continuation_yield_intrinsic()) {
1741       _cont_doYield_stub = nm;
1742     }
1743     return nm;
1744   }
1745 
1746   if (method->is_method_handle_intrinsic()) {
1747     vmIntrinsics::ID iid = method->intrinsic_id();
1748     intptr_t start = (intptr_t)__ pc();
1749     int vep_offset = ((intptr_t)__ pc()) - start;
1750     gen_special_dispatch(masm,
1751                          method,
1752                          in_sig_bt,
1753                          in_regs);
1754     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1755     __ flush();
1756     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1757     return nmethod::new_native_nmethod(method,
1758                                        compile_id,
1759                                        masm->code(),
1760                                        vep_offset,
1761                                        frame_complete,
1762                                        stack_slots / VMRegImpl::slots_per_word,
1763                                        in_ByteSize(-1),
1764                                        in_ByteSize(-1),
1765                                        (OopMapSet*)NULL);
1766   }
1767   address native_func = method->native_function();
1768   assert(native_func != NULL, "must have function");
1769 
1770   // An OopMap for lock (and class if static)
1771   OopMapSet *oop_maps = new OopMapSet();
1772   intptr_t start = (intptr_t)__ pc();
1773 
1774   // We have received a description of where all the java args are located
1775   // on entry to the wrapper. We need to convert these args to where
1776   // the jni function will expect them. To figure out where they go
1777   // we convert the java signature to a C signature by inserting
1778   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1779 
1780   const int total_in_args = method->size_of_parameters();
1781   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1782 
1783   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1784   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1785   BasicType* in_elem_bt = NULL;
1786 
1787   int argc = 0;
1788   out_sig_bt[argc++] = T_ADDRESS;
1789   if (method->is_static()) {
1790     out_sig_bt[argc++] = T_OBJECT;
1791   }
1792 
1793   for (int i = 0; i < total_in_args ; i++ ) {
1794     out_sig_bt[argc++] = in_sig_bt[i];
1795   }
1796 
1797   // Now figure out where the args must be stored and how much stack space
1798   // they require.
1799   int out_arg_slots;
1800   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1801 
1802   // Compute framesize for the wrapper.  We need to handlize all oops in
1803   // incoming registers
1804 
1805   // Calculate the total number of stack slots we will need.
1806 
1807   // First count the abi requirement plus all of the outgoing args
1808   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1809 
1810   // Now the space for the inbound oop handle area
1811   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1812 
1813   int oop_handle_offset = stack_slots;
1814   stack_slots += total_save_slots;
1815 
1816   // Now any space we need for handlizing a klass if static method
1817 
1818   int klass_slot_offset = 0;
1819   int klass_offset = -1;
1820   int lock_slot_offset = 0;
1821   bool is_static = false;
1822 
1823   if (method->is_static()) {
1824     klass_slot_offset = stack_slots;
1825     stack_slots += VMRegImpl::slots_per_word;
1826     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1827     is_static = true;
1828   }
1829 
1830   // Plus a lock if needed
1831 
1832   if (method->is_synchronized()) {
1833     lock_slot_offset = stack_slots;
1834     stack_slots += VMRegImpl::slots_per_word;
1835   }
1836 
1837   // Now a place (+2) to save return values or temp during shuffling
1838   // + 4 for return address (which we own) and saved rbp
1839   stack_slots += 6;
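     // In bytes that is 6 * VMRegImpl::stack_slot_size = 24: 8 bytes of scratch for the
     // moves/return values plus 16 bytes covering the return address and the saved rbp.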
1840 
1841   // Ok The space we have allocated will look like:
1842   //
1843   //
1844   // FP-> |                     |
1845   //      |---------------------|
1846   //      | 2 slots for moves   |
1847   //      |---------------------|
1848   //      | lock box (if sync)  |
1849   //      |---------------------| <- lock_slot_offset
1850   //      | klass (if static)   |
1851   //      |---------------------| <- klass_slot_offset
1852   //      | oopHandle area      |
1853   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1854   //      | outbound memory     |
1855   //      | based arguments     |
1856   //      |                     |
1857   //      |---------------------|
1858   //      |                     |
1859   // SP-> | out_preserved_slots |
1860   //
1861   //
1862 
1863 
1864   // Now compute the actual number of stack words we need, rounding to keep
1865   // the stack properly aligned.
1866   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1867 
1868   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
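     // With the usual 16-byte ABI stack alignment this rounds stack_slots up to a multiple
     // of 4 slots; e.g. 38 slots would become 40 slots, i.e. a 160-byte frame.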
1869 
1870   // First thing: make an IC check to see if we should even be here
1871 
1872   // We are free to use all registers as temps without saving and
1873   // restoring them, except for rbp. rbp is the only callee-saved register
1874   // as far as the interpreter and the compiler(s) are concerned.
1875 
1876 
1877   const Register ic_reg = rax;
1878   const Register receiver = j_rarg0;
1879 
1880   Label hit;
1881   Label exception_pending;
1882 
1883   assert_different_registers(ic_reg, receiver, rscratch1, rscratch2);
1884   __ verify_oop(receiver);
1885   __ load_klass(rscratch1, receiver, rscratch2);
1886   __ cmpq(ic_reg, rscratch1);
1887   __ jcc(Assembler::equal, hit);
1888 
1889   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
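     // Roughly: the call site materializes the expected receiver klass in ic_reg (rax); if it
     // does not match the receiver's actual klass we bail out to the ic-miss stub, which
     // re-resolves the call (e.g. for a polymorphic call site or a stale inline cache).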
1890 
1891   // Verified entry point must be aligned
1892   __ align(8);
1893 
1894   __ bind(hit);
1895 
1896   int vep_offset = ((intptr_t)__ pc()) - start;
1897 
1898   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1899     Label L_skip_barrier;
1900     Register klass = r10;
1901     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1902     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1903 
1904     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1905 
1906     __ bind(L_skip_barrier);
1907   }
1908 
1909 #ifdef COMPILER1
1910   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1911   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1912     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1913   }
1914 #endif // COMPILER1
1915 
1916   // The instruction at the verified entry point must be 5 bytes or longer
1917   // because it can be patched on the fly by make_non_entrant. The stack bang
1918   // instruction fits that requirement.
1919 
1920   // Generate stack overflow check
1921   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1922 
1923   // Generate a new frame for the wrapper.
1924   __ enter();
1925   // -2 because return address is already present and so is saved rbp
1926   __ subptr(rsp, stack_size - 2*wordSize);
1927 
1928   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1929   // native wrapper is not hot enough to micro-optimize the nmethod entry barrier with an out-of-line stub
1930   bs->nmethod_entry_barrier(masm, NULL /* slow_path */, NULL /* continuation */);
1931 
1932   // Frame is now completed as far as size and linkage.
1933   int frame_complete = ((intptr_t)__ pc()) - start;
1934 
1935   if (UseRTMLocking) {
1936     // Abort RTM transaction before calling JNI
1937     // because critical section will be large and will be
1938     // aborted anyway. Also nmethod could be deoptimized.
1939     __ xabort(0);
1940   }
1941 
1942 #ifdef ASSERT
1943   __ check_stack_alignment(rsp, "improperly aligned stack");
1944 #endif /* ASSERT */
1945 
1946 
1947   // We use r14 as the oop handle for the receiver/klass
1948   // It is callee save so it survives the call to native
1949 
1950   const Register oop_handle_reg = r14;
1951 
1952   //
1953   // We immediately shuffle the arguments so that for any vm call we have to
1954   // make from here on out (sync slow path, jvmti, etc.) we will have
1955   // captured the oops from our caller and have a valid oopMap for
1956   // them.
1957 
1958   // -----------------
1959   // The Grand Shuffle
1960 
1961   // The Java calling convention is either equal to (linux) or denser than (win64) the
1962   // C calling convention. However, because of the jni_env argument the C calling
1963   // convention always has at least one more argument (and two more for static methods) than Java.
1964   // Therefore if we move the args from java -> c backwards then we will never have
1965   // a register->register conflict and we don't have to build a dependency graph
1966   // and figure out how to break any cycles.
1967   //
1968 
1969   // Record esp-based slot for receiver on stack for non-static methods
1970   int receiver_offset = -1;
1971 
1972   // This is a trick. We double the stack slots so we can claim
1973   // the oops in the caller's frame. Since we are sure to have
1974   // more args than the caller, doubling is enough to make
1975   // sure we can capture all the incoming oop args from the
1976   // caller.
1977   //
1978   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1979 
1980   // Mark location of rbp (someday)
1981   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1982 
1983   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1984   // All inbound args are referenced based on rbp and all outbound args via rsp.
1985 
1986 
1987 #ifdef ASSERT
1988   bool reg_destroyed[Register::number_of_registers];
1989   bool freg_destroyed[XMMRegister::number_of_registers];
1990   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
1991     reg_destroyed[r] = false;
1992   }
1993   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
1994     freg_destroyed[f] = false;
1995   }
1996 
1997 #endif /* ASSERT */
1998 
1999   // For JNI natives the incoming and outgoing registers are offset upwards.
2000   GrowableArray<int> arg_order(2 * total_in_args);
2001 
2002   VMRegPair tmp_vmreg;
2003   tmp_vmreg.set2(rbx->as_VMReg());
2004 
2005   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2006     arg_order.push(i);
2007     arg_order.push(c_arg);
2008   }
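     // For example (illustrative): a static native method with three Java arguments has
     // total_in_args == 3 and total_c_args == 5, so the pairs recorded are (2,4), (1,3), (0,2);
     // slots 0 and 1 of the C signature (JNIEnv* and the class mirror) are filled in separately
     // below, and, as noted above, walking back-to-front avoids register->register conflicts.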
2009 
2010   int temploc = -1;
2011   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2012     int i = arg_order.at(ai);
2013     int c_arg = arg_order.at(ai + 1);
2014     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2015 #ifdef ASSERT
2016     if (in_regs[i].first()->is_Register()) {
2017       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2018     } else if (in_regs[i].first()->is_XMMRegister()) {
2019       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2020     }
2021     if (out_regs[c_arg].first()->is_Register()) {
2022       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2023     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2024       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2025     }
2026 #endif /* ASSERT */
2027     switch (in_sig_bt[i]) {
2028       case T_ARRAY:
2029       case T_OBJECT:
2030         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2031                     ((i == 0) && (!is_static)),
2032                     &receiver_offset);
2033         break;
2034       case T_VOID:
2035         break;
2036 
2037       case T_FLOAT:
2038         __ float_move(in_regs[i], out_regs[c_arg]);
2039         break;
2040 
2041       case T_DOUBLE:
2042         assert( i + 1 < total_in_args &&
2043                 in_sig_bt[i + 1] == T_VOID &&
2044                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2045         __ double_move(in_regs[i], out_regs[c_arg]);
2046         break;
2047 
2048       case T_LONG :
2049         __ long_move(in_regs[i], out_regs[c_arg]);
2050         break;
2051 
2052       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2053 
2054       default:
2055         __ move32_64(in_regs[i], out_regs[c_arg]);
2056     }
2057   }
2058 
2059   int c_arg;
2060 
2061   // Pre-load a static method's oop into r14.  Used both by locking code and
2062   // the normal JNI call code.
2063   // point c_arg at the first arg that is already loaded in case we
2064   // need to spill before we call out
2065   c_arg = total_c_args - total_in_args;
2066 
2067   if (method->is_static()) {
2068 
2069     //  load oop into a register
2070     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2071 
2072     // Now handlize the static class mirror; it's known to be not-null.
2073     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2074     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2075 
2076     // Now get the handle
2077     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2078     // store the klass handle as second argument
2079     __ movptr(c_rarg1, oop_handle_reg);
2080     // and protect the arg if we must spill
2081     c_arg--;
2082   }
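     // "Handlizing" here means the mirror oop itself stays in a stack slot covered by the
     // oopMap, while the address of that slot (a JNI handle) is what is actually passed to
     // the native code in c_rarg1.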
2083 
2084   // Change state to native (we save the return address in the thread, since it might not
2085   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2086   // points into the right code segment. It does not have to be the correct return pc.
2087   // We use the same pc/oopMap repeatedly when we call out
2088 
2089   intptr_t the_pc = (intptr_t) __ pc();
2090   oop_maps->add_gc_map(the_pc - start, map);
2091 
2092   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2093 
2094 
2095   // We have all of the arguments set up at this point. We must not touch any of the argument
2096   // registers at this point (what if we need to save/restore them? there are no oopMaps for them).
2097 
2098   {
2099     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2100     // protect the args we've loaded
2101     save_args(masm, total_c_args, c_arg, out_regs);
2102     __ mov_metadata(c_rarg1, method());
2103     __ call_VM_leaf(
2104       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2105       r15_thread, c_rarg1);
2106     restore_args(masm, total_c_args, c_arg, out_regs);
2107   }
2108 
2109   // RedefineClasses() tracing support for obsolete method entry
2110   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2111     // protect the args we've loaded
2112     save_args(masm, total_c_args, c_arg, out_regs);
2113     __ mov_metadata(c_rarg1, method());
2114     __ call_VM_leaf(
2115       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2116       r15_thread, c_rarg1);
2117     restore_args(masm, total_c_args, c_arg, out_regs);
2118   }
2119 
2120   // Lock a synchronized method
2121 
2122   // Register definitions used by locking and unlocking
2123 
2124   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2125   const Register obj_reg  = rbx;  // Will contain the oop
2126   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2127   const Register old_hdr  = r13;  // value of old header at unlock time
2128 
2129   Label slow_path_lock;
2130   Label lock_done;
2131 
2132   if (method->is_synchronized()) {
2133     Label count_mon;
2134 
2135     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2136 
2137     // Get the handle (the 2nd argument)
2138     __ mov(oop_handle_reg, c_rarg1);
2139 
2140     // Get address of the box
2141 
2142     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2143 
2144     // Load the oop from the handle
2145     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2146 
2147     if (!UseHeavyMonitors) {
2148       if (UseFastLocking) {
2149         // Load object header
2150         __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2151         __ fast_lock_impl(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2152       } else {
2153         // Load immediate 1 into swap_reg %rax
2154         __ movl(swap_reg, 1);
2155 
2156         // Load (object->mark() | 1) into swap_reg %rax
2157         __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2158 
2159         // Save (object->mark() | 1) into BasicLock's displaced header
2160         __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2161 
2162         // src -> dest iff dest == rax else rax <- dest
2163         __ lock();
2164         __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2165         __ jcc(Assembler::equal, count_mon);
2166 
2167         // Hmm should this move to the slow path code area???
2168 
2169         // Test if the oopMark is an obvious stack pointer, i.e.,
2170         //  1) (mark & 3) == 0, and
2171         //  2) rsp <= mark < mark + os::pagesize()
2172         // These 3 tests can be done by evaluating the following
2173         // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2174         // assuming both stack pointer and pagesize have their
2175         // least significant 2 bits clear.
2176         // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2177 
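             // Worked example (illustrative): with a 4K page, 3 - 4096 sign-extends to
             // 0x...fffff003, so the AND below is zero only when (mark - rsp) has its low two
             // bits clear and lies in [0, page size) -- i.e. the mark is a stack address just
             // above rsp, which is the recursive stack-lock case.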
2178         __ subptr(swap_reg, rsp);
2179         __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2180 
2181         // Save the test result, for recursive case, the result is zero
2182         __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2183         __ jcc(Assembler::notEqual, slow_path_lock);
2184       }
2185     } else {
2186       __ jmp(slow_path_lock);
2187     }
2188     __ bind(count_mon);
2189     __ inc_held_monitor_count();
2190 
2191     // Slow path will re-enter here
2192     __ bind(lock_done);
2193   }
2194 
2195   // Finally just about ready to make the JNI call
2196 
2197   // get JNIEnv* which is first argument to native
2198   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2199 
2200   // Now set thread in native
2201   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2202 
2203   __ call(RuntimeAddress(native_func));
2204 
2205   // Verify or restore cpu control state after JNI call
2206   __ restore_cpu_control_state_after_jni(rscratch1);
2207 
2208   // Unpack native results.
2209   switch (ret_type) {
2210   case T_BOOLEAN: __ c2bool(rax);            break;
2211   case T_CHAR   : __ movzwl(rax, rax);      break;
2212   case T_BYTE   : __ sign_extend_byte (rax); break;
2213   case T_SHORT  : __ sign_extend_short(rax); break;
2214   case T_INT    : /* nothing to do */        break;
2215   case T_DOUBLE :
2216   case T_FLOAT  :
2217     // Result is in xmm0 we'll save as needed
2218     break;
2219   case T_ARRAY:                 // Really a handle
2220   case T_OBJECT:                // Really a handle
2221       break; // can't de-handlize until after safepoint check
2222   case T_VOID: break;
2223   case T_LONG: break;
2224   default       : ShouldNotReachHere();
2225   }
2226 
2227   Label after_transition;
2228 
2229   // Switch thread to "native transition" state before reading the synchronization state.
2230   // This additional state is necessary because reading and testing the synchronization
2231   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2232   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2233   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2234   //     Thread A is resumed to finish this native method, but doesn't block here since it
2235   //     didn't see any synchronization in progress, and escapes.
2236   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2237 
2238   // Force this write out before the read below
2239   if (!UseSystemMemoryBarrier) {
2240     __ membar(Assembler::Membar_mask_bits(
2241               Assembler::LoadLoad | Assembler::LoadStore |
2242               Assembler::StoreLoad | Assembler::StoreStore));
2243   }
2244 
2245   // check for safepoint operation in progress and/or pending suspend requests
2246   {
2247     Label Continue;
2248     Label slow_path;
2249 
2250     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2251 
2252     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2253     __ jcc(Assembler::equal, Continue);
2254     __ bind(slow_path);
2255 
2256     // Don't use call_VM as it will see a possible pending exception and forward it
2257     // and never return here preventing us from clearing _last_native_pc down below.
2258     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2259     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2260     // by hand.
2261     //
2262     __ vzeroupper();
2263     save_native_result(masm, ret_type, stack_slots);
2264     __ mov(c_rarg0, r15_thread);
2265     __ mov(r12, rsp); // remember sp
2266     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2267     __ andptr(rsp, -16); // align stack as required by ABI
2268     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2269     __ mov(rsp, r12); // restore sp
2270     __ reinit_heapbase();
2271     // Restore any method result value
2272     restore_native_result(masm, ret_type, stack_slots);
2273     __ bind(Continue);
2274   }
2275 
2276   // change thread state
2277   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2278   __ bind(after_transition);
2279 
2280   Label reguard;
2281   Label reguard_done;
2282   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2283   __ jcc(Assembler::equal, reguard);
2284   __ bind(reguard_done);
2285 
2286   // native result if any is live
2287 
2288   // Unlock
2289   Label slow_path_unlock;
2290   Label unlock_done;
2291   if (method->is_synchronized()) {
2292 
2293     Label fast_done;
2294 
2295     // Get locked oop from the handle we passed to jni
2296     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2297 
2298     if (!UseHeavyMonitors && !UseFastLocking) {
2299       Label not_recur;
2300       // Simple recursive lock?
2301       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2302       __ jcc(Assembler::notEqual, not_recur);
2303       __ dec_held_monitor_count();
2304       __ jmpb(fast_done);
2305       __ bind(not_recur);
2306     }
2307 
2308     // Must save rax if it is live now because cmpxchg must use it
2309     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2310       save_native_result(masm, ret_type, stack_slots);
2311     }
2312 
2313     if (!UseHeavyMonitors) {
2314       if (UseFastLocking) {
2315         __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2316         __ andptr(swap_reg, ~(int32_t)markWord::lock_mask_in_place);
2317         __ fast_unlock_impl(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2318       } else {
2319         // get address of the stack lock
2320         __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2321         //  get old displaced header
2322         __ movptr(old_hdr, Address(rax, 0));
2323 
2324         // Atomic swap old header if oop still contains the stack lock
2325         __ lock();
2326         __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2327         __ jcc(Assembler::notEqual, slow_path_unlock);
2328       }
2329       __ dec_held_monitor_count();
2330     } else {
2331       __ jmp(slow_path_unlock);
2332     }
2333 
2334     // slow path re-enters here
2335     __ bind(unlock_done);
2336     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2337       restore_native_result(masm, ret_type, stack_slots);
2338     }
2339 
2340     __ bind(fast_done);
2341   }
2342   {
2343     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2344     save_native_result(masm, ret_type, stack_slots);
2345     __ mov_metadata(c_rarg1, method());
2346     __ call_VM_leaf(
2347          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2348          r15_thread, c_rarg1);
2349     restore_native_result(masm, ret_type, stack_slots);
2350   }
2351 
2352   __ reset_last_Java_frame(false);
2353 
2354   // Unbox oop result, e.g. JNIHandles::resolve value.
2355   if (is_reference_type(ret_type)) {
2356     __ resolve_jobject(rax /* value */,
2357                        r15_thread /* thread */,
2358                        rcx /* tmp */);
2359   }
2360 
2361   if (CheckJNICalls) {
2362     // clear_pending_jni_exception_check
2363     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2364   }
2365 
2366   // reset handle block
2367   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2368   __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), NULL_WORD);
2369 
2370   // pop our frame
2371 
2372   __ leave();
2373 
2374   // Any exception pending?
2375   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2376   __ jcc(Assembler::notEqual, exception_pending);
2377 
2378   // Return
2379 
2380   __ ret(0);
2381 
2382   // Unexpected paths are out of line and go here
2383 
2384   // forward the exception
2385   __ bind(exception_pending);
2386 
2387   // and forward the exception
2388   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2389 
2390   // Slow path locking & unlocking
2391   if (method->is_synchronized()) {
2392 
2393     // BEGIN Slow path lock
2394     __ bind(slow_path_lock);
2395 
2396     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2397     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2398 
2399     // protect the args we've loaded
2400     save_args(masm, total_c_args, c_arg, out_regs);
2401 
2402     __ mov(c_rarg0, obj_reg);
2403     __ mov(c_rarg1, lock_reg);
2404     __ mov(c_rarg2, r15_thread);
2405 
2406     // Not a leaf but we have last_Java_frame setup as we want
2407     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2408     restore_args(masm, total_c_args, c_arg, out_regs);
2409 
2410 #ifdef ASSERT
2411     { Label L;
2412     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2413     __ jcc(Assembler::equal, L);
2414     __ stop("no pending exception allowed on exit from monitorenter");
2415     __ bind(L);
2416     }
2417 #endif
2418     __ jmp(lock_done);
2419 
2420     // END Slow path lock
2421 
2422     // BEGIN Slow path unlock
2423     __ bind(slow_path_unlock);
2424 
2425     // If we haven't already saved the native result we must save it now as xmm registers
2426     // are still exposed.
2427     __ vzeroupper();
2428     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2429       save_native_result(masm, ret_type, stack_slots);
2430     }
2431 
2432     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2433 
2434     __ mov(c_rarg0, obj_reg);
2435     __ mov(c_rarg2, r15_thread);
2436     __ mov(r12, rsp); // remember sp
2437     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2438     __ andptr(rsp, -16); // align stack as required by ABI
2439 
2440     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2441     // NOTE that obj_reg == rbx currently
2442     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2443     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2444 
2445     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2446     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2447     __ mov(rsp, r12); // restore sp
2448     __ reinit_heapbase();
2449 #ifdef ASSERT
2450     {
2451       Label L;
2452       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2453       __ jcc(Assembler::equal, L);
2454       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2455       __ bind(L);
2456     }
2457 #endif /* ASSERT */
2458 
2459     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2460 
2461     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2462       restore_native_result(masm, ret_type, stack_slots);
2463     }
2464     __ jmp(unlock_done);
2465 
2466     // END Slow path unlock
2467 
2468   } // synchronized
2469 
2470   // SLOW PATH Reguard the stack if needed
2471 
2472   __ bind(reguard);
2473   __ vzeroupper();
2474   save_native_result(masm, ret_type, stack_slots);
2475   __ mov(r12, rsp); // remember sp
2476   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2477   __ andptr(rsp, -16); // align stack as required by ABI
2478   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2479   __ mov(rsp, r12); // restore sp
2480   __ reinit_heapbase();
2481   restore_native_result(masm, ret_type, stack_slots);
2482   // and continue
2483   __ jmp(reguard_done);
2484 
2485 
2486 
2487   __ flush();
2488 
2489   nmethod *nm = nmethod::new_native_nmethod(method,
2490                                             compile_id,
2491                                             masm->code(),
2492                                             vep_offset,
2493                                             frame_complete,
2494                                             stack_slots / VMRegImpl::slots_per_word,
2495                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2496                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2497                                             oop_maps);
2498 
2499   return nm;
2500 }
2501 
2502 // This function returns the adjustment size (in number of words) to a c2i adapter
2503 // activation for use during deoptimization
2504 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2505   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2506 }
2507 
2508 
2509 uint SharedRuntime::out_preserve_stack_slots() {
2510   return 0;
2511 }
2512 
2513 
2514 // Number of stack slots between incoming argument block and the start of
2515 // a new frame.  The PROLOG must add this many slots to the stack.  The
2516 // EPILOG must remove this many slots.  amd64 needs two slots for
2517 // return address.
2518 uint SharedRuntime::in_preserve_stack_slots() {
2519   return 4 + 2 * VerifyStackAtCalls;
2520 }
2521 
2522 //------------------------------generate_deopt_blob----------------------------
2523 void SharedRuntime::generate_deopt_blob() {
2524   // Allocate space for the code
2525   ResourceMark rm;
2526   // Setup code generation tools
2527   int pad = 0;
2528   if (UseAVX > 2) {
2529     pad += 1024;
2530   }
2531 #if INCLUDE_JVMCI
2532   if (EnableJVMCI) {
2533     pad += 512; // Increase the buffer size when compiling for JVMCI
2534   }
2535 #endif
2536   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2537   MacroAssembler* masm = new MacroAssembler(&buffer);
2538   int frame_size_in_words;
2539   OopMap* map = NULL;
2540   OopMapSet *oop_maps = new OopMapSet();
2541 
2542   // -------------
2543   // This code enters when returning to a de-optimized nmethod.  A return
2544   // address has been pushed on the stack, and return values are in
2545   // registers.
2546   // If we are doing a normal deopt then we were called from the patched
2547   // nmethod from the point we returned to the nmethod. So the return
2548   // address on the stack is wrong by NativeCall::instruction_size
2549   // We will adjust the value so it looks like we have the original return
2550   // address on the stack (like when we eagerly deoptimized).
2551   // In the case of an exception pending when deoptimizing, we enter
2552   // with a return address on the stack that points after the call we patched
2553   // into the exception handler. We have the following register state from,
2554   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2555   //    rax: exception oop
2556   //    rbx: exception handler
2557   //    rdx: throwing pc
2558   // So in this case we simply jam rdx into the useless return address and
2559   // the stack looks just like we want.
2560   //
2561   // At this point we need to de-opt.  We save the argument return
2562   // registers.  We call the first C routine, fetch_unroll_info().  This
2563   // routine captures the return values and returns a structure which
2564   // describes the current frame size and the sizes of all replacement frames.
2565   // The current frame is compiled code and may contain many inlined
2566   // functions, each with their own JVM state.  We pop the current frame, then
2567   // push all the new frames.  Then we call the C routine unpack_frames() to
2568   // populate these frames.  Finally unpack_frames() returns us the new target
2569   // address.  Notice that callee-save registers are BLOWN here; they have
2570   // already been captured in the vframeArray at the time the return PC was
2571   // patched.
2572   address start = __ pc();
2573   Label cont;
2574 
2575   // Prolog for non exception case!
2576 
2577   // Save everything in sight.
2578   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2579 
2580   // Normal deoptimization.  Save exec mode for unpack_frames.
2581   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2582   __ jmp(cont);
2583 
2584   int reexecute_offset = __ pc() - start;
2585 #if INCLUDE_JVMCI && !defined(COMPILER1)
2586   if (EnableJVMCI && UseJVMCICompiler) {
2587     // JVMCI does not use this kind of deoptimization
2588     __ should_not_reach_here();
2589   }
2590 #endif
2591 
2592   // Reexecute case
2593   // return address is the pc that describes what bci to re-execute at
2594 
2595   // No need to update map as each call to save_live_registers will produce identical oopmap
2596   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2597 
2598   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2599   __ jmp(cont);
2600 
2601 #if INCLUDE_JVMCI
2602   Label after_fetch_unroll_info_call;
2603   int implicit_exception_uncommon_trap_offset = 0;
2604   int uncommon_trap_offset = 0;
2605 
2606   if (EnableJVMCI) {
2607     implicit_exception_uncommon_trap_offset = __ pc() - start;
2608 
2609     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2610     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2611 
2612     uncommon_trap_offset = __ pc() - start;
2613 
2614     // Save everything in sight.
2615     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2616     // fetch_unroll_info needs to call last_java_frame()
2617     __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
2618 
2619     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2620     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2621 
2622     __ movl(r14, Deoptimization::Unpack_reexecute);
2623     __ mov(c_rarg0, r15_thread);
2624     __ movl(c_rarg2, r14); // exec mode
2625     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2626     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2627 
2628     __ reset_last_Java_frame(false);
2629 
2630     __ jmp(after_fetch_unroll_info_call);
2631   } // EnableJVMCI
2632 #endif // INCLUDE_JVMCI
2633 
2634   int exception_offset = __ pc() - start;
2635 
2636   // Prolog for exception case
2637 
2638   // all registers are dead at this entry point, except for rax, and
2639   // rdx which contain the exception oop and exception pc
2640   // respectively.  Set them in TLS and fall thru to the
2641   // unpack_with_exception_in_tls entry point.
2642 
2643   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2644   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2645 
2646   int exception_in_tls_offset = __ pc() - start;
2647 
2648   // new implementation because exception oop is now passed in JavaThread
2649 
2650   // Prolog for exception case
2651   // All registers must be preserved because they might be used by LinearScan
2652   // Exception oop and throwing PC are passed in JavaThread
2653   // tos: stack at point of call to method that threw the exception (i.e. only
2654   // args are on the stack, no return address)
2655 
2656   // make room on stack for the return address
2657   // It will be patched later with the throwing pc. The correct value is not
2658   // available now because loading it from memory would destroy registers.
2659   __ push(0);
2660 
2661   // Save everything in sight.
2662   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2663 
2664   // Now it is safe to overwrite any register
2665 
2666   // Deopt during an exception.  Save exec mode for unpack_frames.
2667   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2668 
2669   // load throwing pc from JavaThread and patch it as the return address
2670   // of the current frame. Then clear the field in JavaThread
2671 
2672   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2673   __ movptr(Address(rbp, wordSize), rdx);
2674   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2675 
2676 #ifdef ASSERT
2677   // verify that there is really an exception oop in JavaThread
2678   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2679   __ verify_oop(rax);
2680 
2681   // verify that there is no pending exception
2682   Label no_pending_exception;
2683   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2684   __ testptr(rax, rax);
2685   __ jcc(Assembler::zero, no_pending_exception);
2686   __ stop("must not have pending exception here");
2687   __ bind(no_pending_exception);
2688 #endif
2689 
2690   __ bind(cont);
2691 
2692   // Call C code.  Need thread and this frame, but NOT official VM entry
2693   // crud.  We cannot block on this call, no GC can happen.
2694   //
2695   // UnrollBlock* fetch_unroll_info(JavaThread* thread, int exec_mode)
2696 
2697   // fetch_unroll_info needs to call last_java_frame().
2698 
2699   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
2700 #ifdef ASSERT
2701   { Label L;
2702     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2703     __ jcc(Assembler::equal, L);
2704     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2705     __ bind(L);
2706   }
2707 #endif // ASSERT
2708   __ mov(c_rarg0, r15_thread);
2709   __ movl(c_rarg1, r14); // exec_mode
2710   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2711 
2712   // Need to have an oopmap that tells fetch_unroll_info where to
2713   // find any register it might need.
2714   oop_maps->add_gc_map(__ pc() - start, map);
2715 
2716   __ reset_last_Java_frame(false);
2717 
2718 #if INCLUDE_JVMCI
2719   if (EnableJVMCI) {
2720     __ bind(after_fetch_unroll_info_call);
2721   }
2722 #endif
2723 
2724   // Load UnrollBlock* into rdi
2725   __ mov(rdi, rax);
2726 
2727   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2728   Label noException;
2729   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2730   __ jcc(Assembler::notEqual, noException);
2731   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2732   // QQQ this is useless, it was NULL above
2733   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2734   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2735   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2736 
2737   __ verify_oop(rax);
2738 
2739   // Overwrite the result registers with the exception results.
2740   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2741   // I think this is useless
2742   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2743 
2744   __ bind(noException);
2745 
2746   // Only register save data is on the stack.
2747   // Now restore the result registers.  Everything else is either dead
2748   // or captured in the vframeArray.
2749   RegisterSaver::restore_result_registers(masm);
2750 
2751   // All of the register save area has been popped off the stack. Only the
2752   // return address remains.
2753 
2754   // Pop all the frames we must move/replace.
2755   //
2756   // Frame picture (youngest to oldest)
2757   // 1: self-frame (no frame link)
2758   // 2: deopting frame  (no frame link)
2759   // 3: caller of deopting frame (could be compiled/interpreted).
2760   //
2761   // Note: by leaving the return address of self-frame on the stack
2762   // and using the size of frame 2 to adjust the stack
2763   // when we are done, the return to frame 3 will still be on the stack.
2764 
2765   // Pop deoptimized frame
2766   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2767   __ addptr(rsp, rcx);
2768 
2769   // rsp should be pointing at the return address to the caller (3)
2770 
2771   // Pick up the initial fp we should save
2772   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2773   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2774 
2775 #ifdef ASSERT
2776   // Compilers generate code that bangs the stack by as much as the
2777   // interpreter would need. So this stack banging should never
2778   // trigger a fault. Verify that it does not on non-product builds.
2779   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2780   __ bang_stack_size(rbx, rcx);
2781 #endif
2782 
2783   // Load address of array of frame pcs into rcx
2784   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2785 
2786   // Trash the old pc
2787   __ addptr(rsp, wordSize);
2788 
2789   // Load address of array of frame sizes into rsi
2790   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2791 
2792   // Load counter into rdx
2793   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2794 
2795   // Now adjust the caller's stack to make up for the extra locals
2796   // but record the original sp so that we can save it in the skeletal interpreter
2797   // frame and the stack walking of interpreter_sender will get the unextended sp
2798   // value and not the "real" sp value.
2799 
2800   const Register sender_sp = r8;
2801 
2802   __ mov(sender_sp, rsp);
2803   __ movl(rbx, Address(rdi,
2804                        Deoptimization::UnrollBlock::
2805                        caller_adjustment_offset_in_bytes()));
2806   __ subptr(rsp, rbx);
2807 
2808   // Push interpreter frames in a loop
2809   Label loop;
2810   __ bind(loop);
2811   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2812   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2813   __ pushptr(Address(rcx, 0));          // Save return address
2814   __ enter();                           // Save old & set new ebp
2815   __ subptr(rsp, rbx);                  // Prolog
2816   // This value is corrected by layout_activation_impl
2817   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2818   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2819   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2820   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2821   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2822   __ decrementl(rdx);                   // Decrement counter
2823   __ jcc(Assembler::notZero, loop);
2824   __ pushptr(Address(rcx, 0));          // Save final return address
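
  // Roughly, the loop above rebuilds the skeletal interpreter frames as in the
  // following C-style sketch (illustrative only, not compiled; the arrays and
  // the count come from the UnrollBlock, and the rbp[] offsets are in words):
  //
  //   for (int k = 0; k < number_of_frames; k++) {
  //     push(frame_pcs[k]);                      // return address of new frame
  //     push(rbp); rbp = rsp;                    // enter: link the frame
  //     rsp -= frame_sizes[k] - 2 * wordSize;    // pc and rbp already pushed
  //     rbp[interpreter_frame_last_sp_offset]   = NULL;      // fixed up later
  //     rbp[interpreter_frame_sender_sp_offset] = sender_sp; // make walkable
  //     sender_sp = rsp;                         // becomes next frame's sender sp
  //   }
  //   push(frame_pcs[number_of_frames]);         // final return address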
2825 
2826   // Re-push self-frame
2827   __ enter();                           // Save old & set new ebp
2828 
2829   // Allocate a full sized register save area.
2830   // Return address and rbp are in place, so we allocate two fewer words.
2831   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2832 
2833   // Restore frame locals after moving the frame
2834   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2835   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2836 
2837   // Call C code.  Need thread but NOT official VM entry
2838   // crud.  We cannot block on this call, no GC can happen.  Call should
2839   // restore return values to their stack-slots with the new SP.
2840   //
2841   // BasicType Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2842 
2843   // Use rbp because the frames look interpreted now
2844   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2845   // Don't need the precise return PC here, just precise enough to point into this code blob.
2846   address the_pc = __ pc();
2847   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2848 
2849   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2850   __ mov(c_rarg0, r15_thread);
2851   __ movl(c_rarg1, r14); // second arg: exec_mode
2852   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2853   // Revert SP alignment after call since we're going to do some SP relative addressing below
2854   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2855 
2856   // Set an oopmap for the call site
2857   // Use the same PC we used for the last java frame
2858   oop_maps->add_gc_map(the_pc - start,
2859                        new OopMap( frame_size_in_words, 0 ));
2860 
2861   // Clear fp AND pc
2862   __ reset_last_Java_frame(true);
2863 
2864   // Collect return values
2865   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2866   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2867   // I think this is useless (throwing pc?)
2868   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2869 
2870   // Pop self-frame.
2871   __ leave();                           // Epilog
2872 
2873   // Jump to interpreter
2874   __ ret(0);
2875 
2876   // Make sure all code is generated
2877   masm->flush();
2878 
2879   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2880   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2881 #if INCLUDE_JVMCI
2882   if (EnableJVMCI) {
2883     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2884     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2885   }
2886 #endif
2887 }
2888 
2889 #ifdef COMPILER2
2890 //------------------------------generate_uncommon_trap_blob--------------------
2891 void SharedRuntime::generate_uncommon_trap_blob() {
2892   // Allocate space for the code
2893   ResourceMark rm;
2894   // Setup code generation tools
2895   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2896   MacroAssembler* masm = new MacroAssembler(&buffer);
2897 
2898   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2899 
2900   address start = __ pc();
2901 
2902   if (UseRTMLocking) {
2903     // Abort RTM transaction before possible nmethod deoptimization.
2904     __ xabort(0);
2905   }
2906 
2907   // Push self-frame.  We get here with a return address on the
2908   // stack, so rsp is 8-byte aligned until we allocate our frame.
2909   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2910 
2911   // No callee saved registers. rbp is assumed implicitly saved
2912   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2913 
2914   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2915   // runtime expects it.
2916   __ movl(c_rarg1, j_rarg0);
2917 
2918   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
2919 
2920   // Call C code.  Need thread but NOT official VM entry
2921   // crud.  We cannot block on this call, no GC can happen.  Call should
2922   // capture callee-saved registers as well as return values.
2923   // Thread is in rdi already.
2924   //
2925   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
2926 
2927   __ mov(c_rarg0, r15_thread);
2928   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2929   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2930 
2931   // Set an oopmap for the call site
2932   OopMapSet* oop_maps = new OopMapSet();
2933   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2934 
2935   // location of rbp is known implicitly by the frame sender code
2936 
2937   oop_maps->add_gc_map(__ pc() - start, map);
2938 
2939   __ reset_last_Java_frame(false);
2940 
2941   // Load UnrollBlock* into rdi
2942   __ mov(rdi, rax);
2943 
2944 #ifdef ASSERT
2945   { Label L;
2946     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2947               Deoptimization::Unpack_uncommon_trap);
2948     __ jcc(Assembler::equal, L);
2949     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2950     __ bind(L);
2951   }
2952 #endif
2953 
2954   // Pop all the frames we must move/replace.
2955   //
2956   // Frame picture (youngest to oldest)
2957   // 1: self-frame (no frame link)
2958   // 2: deopting frame  (no frame link)
2959   // 3: caller of deopting frame (could be compiled/interpreted).
2960 
2961   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2962   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2963 
2964   // Pop deoptimized frame (int)
2965   __ movl(rcx, Address(rdi,
2966                        Deoptimization::UnrollBlock::
2967                        size_of_deoptimized_frame_offset_in_bytes()));
2968   __ addptr(rsp, rcx);
2969 
2970   // rsp should be pointing at the return address to the caller (3)
2971 
2972   // Pick up the initial fp we should save
2973   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2974   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2975 
2976 #ifdef ASSERT
2977   // Compilers generate code that bangs the stack by as much as the
2978   // interpreter would need. So this stack banging should never
2979   // trigger a fault. Verify that it does not on non-product builds.
2980   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2981   __ bang_stack_size(rbx, rcx);
2982 #endif
2983 
2984   // Load address of array of frame pcs into rcx (address*)
2985   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2986 
2987   // Trash the return pc
2988   __ addptr(rsp, wordSize);
2989 
2990   // Load address of array of frame sizes into rsi (intptr_t*)
2991   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2992 
2993   // Counter
2994   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
2995 
2996   // Now adjust the caller's stack to make up for the extra locals but
2997   // record the original sp so that we can save it in the skeletal
2998   // interpreter frame and the stack walking of interpreter_sender
2999   // will get the unextended sp value and not the "real" sp value.
3000 
3001   const Register sender_sp = r8;
3002 
3003   __ mov(sender_sp, rsp);
3004   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
3005   __ subptr(rsp, rbx);
3006 
3007   // Push interpreter frames in a loop
3008   Label loop;
3009   __ bind(loop);
3010   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3011   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3012   __ pushptr(Address(rcx, 0));     // Save return address
3013   __ enter();                      // Save old & set new rbp
3014   __ subptr(rsp, rbx);             // Prolog
3015   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3016             sender_sp);            // Make it walkable
3017   // This value is corrected by layout_activation_impl
3018   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3019   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3020   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3021   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3022   __ decrementl(rdx);              // Decrement counter
3023   __ jcc(Assembler::notZero, loop);
3024   __ pushptr(Address(rcx, 0));     // Save final return address
3025 
3026   // Re-push self-frame
3027   __ enter();                 // Save old & set new rbp
3028   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3029                               // Prolog
3030 
3031   // Use rbp because the frames look interpreted now
3032   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3033   // Don't need the precise return PC here, just precise enough to point into this code blob.
3034   address the_pc = __ pc();
3035   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3036 
3037   // Call C code.  Need thread but NOT official VM entry
3038   // crud.  We cannot block on this call, no GC can happen.  Call should
3039   // restore return values to their stack-slots with the new SP.
3040   // Thread is in rdi already.
3041   //
3042   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3043 
3044   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3045   __ mov(c_rarg0, r15_thread);
3046   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3047   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3048 
3049   // Set an oopmap for the call site
3050   // Use the same PC we used for the last java frame
3051   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3052 
3053   // Clear fp AND pc
3054   __ reset_last_Java_frame(true);
3055 
3056   // Pop self-frame.
3057   __ leave();                 // Epilog
3058 
3059   // Jump to interpreter
3060   __ ret(0);
3061 
3062   // Make sure all code is generated
3063   masm->flush();
3064 
3065   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3066                                                  SimpleRuntimeFrame::framesize >> 1);
3067 }
3068 #endif // COMPILER2
3069 
3070 //------------------------------generate_handler_blob------
3071 //
3072 // Generate a special Compile2Runtime blob that saves all registers
3073 // and sets up an oopmap.
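//
// The resulting SafepointBlob is invoked when a thread is stopped at a
// safepoint poll: call_ptr is the VM entry to call (typically
// SafepointSynchronize::handle_polling_page_exception), and poll_type selects
// whether the poll was at a return (cause_return) and whether wide vector
// state must be preserved.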
3074 //
3075 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3076   assert(StubRoutines::forward_exception_entry() != NULL,
3077          "must be generated before");
3078 
3079   ResourceMark rm;
3080   OopMapSet *oop_maps = new OopMapSet();
3081   OopMap* map;
3082 
3083   // Allocate space for the code.  Setup code generation tools.
3084   CodeBuffer buffer("handler_blob", 2048, 1024);
3085   MacroAssembler* masm = new MacroAssembler(&buffer);
3086 
3087   address start   = __ pc();
3088   address call_pc = NULL;
3089   int frame_size_in_words;
3090   bool cause_return = (poll_type == POLL_AT_RETURN);
3091   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3092 
3093   if (UseRTMLocking) {
3094     // Abort RTM transaction before calling runtime
3095     // because critical section will be large and will be
3096     // aborted anyway. Also nmethod could be deoptimized.
3097     __ xabort(0);
3098   }
3099 
3100   // Make room for return address (or push it again)
3101   if (!cause_return) {
3102     __ push(rbx);
3103   }
3104 
3105   // Save registers, fpu state, and flags
3106   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3107 
3108   // The following is basically a call_VM.  However, we need the precise
3109   // address of the call in order to generate an oopmap. Hence, we do all the
3110   // work ourselves.
3111 
3112   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3113 
3114   // The return address must always be correct so that the frame constructor
3115   // never sees an invalid pc.
3116 
3117   if (!cause_return) {
3118     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3119     // Additionally, rbx is a callee-saved register and we can look at it later to determine
3120     // if someone changed the return address for us!
3121     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3122     __ movptr(Address(rbp, wordSize), rbx);
3123   }
3124 
3125   // Do the call
3126   __ mov(c_rarg0, r15_thread);
3127   __ call(RuntimeAddress(call_ptr));
3128 
3129   // Set an oopmap for the call site.  This oopmap will map all
3130   // oop-registers and debug-info registers as callee-saved.  This
3131   // will allow deoptimization at this safepoint to find all possible
3132   // debug-info recordings, as well as let GC find all oops.
3133 
3134   oop_maps->add_gc_map( __ pc() - start, map);
3135 
3136   Label noException;
3137 
3138   __ reset_last_Java_frame(false);
3139 
3140   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3141   __ jcc(Assembler::equal, noException);
3142 
3143   // Exception pending
3144 
3145   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3146 
3147   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3148 
3149   // No exception case
3150   __ bind(noException);
3151 
3152   Label no_adjust;
3153 #ifdef ASSERT
3154   Label bail;
3155 #endif
3156   if (!cause_return) {
3157     Label no_prefix, not_special;
3158 
3159     // If our stashed return pc was modified by the runtime, we avoid touching it
3160     __ cmpptr(rbx, Address(rbp, wordSize));
3161     __ jccb(Assembler::notEqual, no_adjust);
3162 
3163     // Skip over the poll instruction.
3164     // See NativeInstruction::is_safepoint_poll()
3165     // Possible encodings:
3166     //      85 00       test   %eax,(%rax)
3167     //      85 01       test   %eax,(%rcx)
3168     //      85 02       test   %eax,(%rdx)
3169     //      85 03       test   %eax,(%rbx)
3170     //      85 06       test   %eax,(%rsi)
3171     //      85 07       test   %eax,(%rdi)
3172     //
3173     //   41 85 00       test   %eax,(%r8)
3174     //   41 85 01       test   %eax,(%r9)
3175     //   41 85 02       test   %eax,(%r10)
3176     //   41 85 03       test   %eax,(%r11)
3177     //   41 85 06       test   %eax,(%r14)
3178     //   41 85 07       test   %eax,(%r15)
3179     //
3180     //      85 04 24    test   %eax,(%rsp)
3181     //   41 85 04 24    test   %eax,(%r12)
3182     //      85 45 00    test   %eax,0x0(%rbp)
3183     //   41 85 45 00    test   %eax,0x0(%r13)
3184 
3185     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3186     __ jcc(Assembler::notEqual, no_prefix);
3187     __ addptr(rbx, 1);
3188     __ bind(no_prefix);
3189 #ifdef ASSERT
3190     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3191 #endif
3192     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3193     // r12/rsp 0x04
3194     // r13/rbp 0x05
3195     __ movzbq(rcx, Address(rbx, 1));
3196     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3197     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3198     __ cmpptr(rcx, 1);
3199     __ jcc(Assembler::above, not_special);
3200     __ addptr(rbx, 1);
3201     __ bind(not_special);
3202 #ifdef ASSERT
3203     // Verify the correct encoding of the poll we're about to skip.
3204     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3205     __ jcc(Assembler::notEqual, bail);
3206     // Mask out the modrm bits
3207     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3208     // rax encodes to 0, so if the bits are nonzero it's incorrect
3209     __ jcc(Assembler::notZero, bail);
3210 #endif
3211     // Adjust return pc forward to step over the safepoint poll instruction
3212     __ addptr(rbx, 2);
3213     __ movptr(Address(rbp, wordSize), rbx);
3214   }
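
  // The adjustment above re-derives the length of the poll instruction at the
  // saved return pc (rbx). Illustrative sketch of the decode, assuming pc
  // points at one of the encodings listed above:
  //
  //   if (pc[0] == 0x41) pc++;                 // REX.B prefix (r8-r15 base)
  //   int base = pc[1] & 0x07;                 // modrm base register bits
  //   if (base == 0x04 || base == 0x05) pc++;  // rsp/r12 carry a SIB byte,
  //                                            // rbp/r13 a disp8 byte
  //   pc += 2;                                 // skip the 85 /r test itself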
3215 
3216   __ bind(no_adjust);
3217   // Normal exit, restore registers and exit.
3218   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3219   __ ret(0);
3220 
3221 #ifdef ASSERT
3222   __ bind(bail);
3223   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3224 #endif
3225 
3226   // Make sure all code is generated
3227   masm->flush();
3228 
3229   // Fill-out other meta info
3230   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3231 }
3232 
3233 //
3234 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3235 //
3236 // Generate a stub that calls into the vm to find out the proper destination
3237 // of a java call. All the argument registers are live at this point,
3238 // but since this is generic code we don't know what they are and the caller
3239 // must do any gc of the args.
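//
// (Note: the destinations used with this generator are the SharedRuntime
// resolve_*_call_C and handle_wrong_method* entry points; they hand back the
// code entry to jump to in rax and the resolved Method* via vm_result_2,
// which is reloaded into rbx below.)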
3240 //
3241 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3242   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3243 
3244   // allocate space for the code
3245   ResourceMark rm;
3246 
3247   CodeBuffer buffer(name, 1200, 512);
3248   MacroAssembler* masm = new MacroAssembler(&buffer);
3249 
3250   int frame_size_in_words;
3251 
3252   OopMapSet *oop_maps = new OopMapSet();
3253   OopMap* map = NULL;
3254 
3255   int start = __ offset();
3256 
3257   // No need to save vector registers since they are caller-saved anyway.
3258   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3259 
3260   int frame_complete = __ offset();
3261 
3262   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
3263 
3264   __ mov(c_rarg0, r15_thread);
3265 
3266   __ call(RuntimeAddress(destination));
3267 
3268 
3269   // Set an oopmap for the call site.
3270   // We need this not only for callee-saved registers, but also for volatile
3271   // registers that the compiler might be keeping live across a safepoint.
3272 
3273   oop_maps->add_gc_map( __ offset() - start, map);
3274 
3275   // rax contains the address we are going to jump to, assuming no exception got installed
3276 
3277   // clear last_Java_sp
3278   __ reset_last_Java_frame(false);
3279   // check for pending exceptions
3280   Label pending;
3281   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3282   __ jcc(Assembler::notEqual, pending);
3283 
3284   // get the returned Method*
3285   __ get_vm_result_2(rbx, r15_thread);
3286   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3287 
3288   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3289 
3290   RegisterSaver::restore_live_registers(masm);
3291 
3292   // We are back to the original state on entry and ready to go.
3293 
3294   __ jmp(rax);
3295 
3296   // Pending exception after the safepoint
3297 
3298   __ bind(pending);
3299 
3300   RegisterSaver::restore_live_registers(masm);
3301 
3302   // exception pending => remove activation and forward to exception handler
3303 
3304   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3305 
3306   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3307   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3308 
3309   // -------------
3310   // make sure all code is generated
3311   masm->flush();
3312 
3313   // return the blob
3314   // frame_size_words or bytes??
3315   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3316 }
3317 
3318 //------------------------------Montgomery multiplication------------------------
3319 //
3320 
3321 #ifndef _WINDOWS
3322 
3323 // Subtract 0:b from carry:a.  Return carry.
3324 static julong
3325 sub(julong a[], julong b[], julong carry, long len) {
3326   long long i = 0, cnt = len;
3327   julong tmp;
3328   asm volatile("clc; "
3329                "0: ; "
3330                "mov (%[b], %[i], 8), %[tmp]; "
3331                "sbb %[tmp], (%[a], %[i], 8); "
3332                "inc %[i]; dec %[cnt]; "
3333                "jne 0b; "
3334                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3335                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3336                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3337                : "memory");
3338   return tmp;
3339 }
3340 
3341 // Multiply (unsigned) Long A by Long B, accumulating the double-
3342 // length result into the accumulator formed of T0, T1, and T2.
3343 #define MACC(A, B, T0, T1, T2)                                  \
3344 do {                                                            \
3345   unsigned long hi, lo;                                         \
3346   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3347            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3348            : "r"(A), "a"(B) : "cc");                            \
3349  } while(0)
3350 
3351 // As above, but add twice the double-length result into the
3352 // accumulator.
3353 #define MACC2(A, B, T0, T1, T2)                                 \
3354 do {                                                            \
3355   unsigned long hi, lo;                                         \
3356   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3357            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3358            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3359            : "r"(A), "a"(B) : "cc");                            \
3360  } while(0)
3361 
3362 #else //_WINDOWS
3363 
3364 static julong
3365 sub(julong a[], julong b[], julong carry, long len) {
3366   long i;
3367   julong tmp;
3368   unsigned char c = 1;
3369   for (i = 0; i < len; i++) {
3370     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3371     a[i] = tmp;
3372   }
3373   c = _addcarry_u64(c, carry, ~0, &tmp);
3374   return tmp;
3375 }
3376 
3377 // Multiply (unsigned) Long A by Long B, accumulating the double-
3378 // length result into the accumulator formed of T0, T1, and T2.
3379 #define MACC(A, B, T0, T1, T2)                          \
3380 do {                                                    \
3381   julong hi, lo;                            \
3382   lo = _umul128(A, B, &hi);                             \
3383   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3384   c = _addcarry_u64(c, hi, T1, &T1);                    \
3385   _addcarry_u64(c, T2, 0, &T2);                         \
3386  } while(0)
3387 
3388 // As above, but add twice the double-length result into the
3389 // accumulator.
3390 #define MACC2(A, B, T0, T1, T2)                         \
3391 do {                                                    \
3392   julong hi, lo;                            \
3393   lo = _umul128(A, B, &hi);                             \
3394   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3395   c = _addcarry_u64(c, hi, T1, &T1);                    \
3396   _addcarry_u64(c, T2, 0, &T2);                         \
3397   c = _addcarry_u64(0, lo, T0, &T0);                    \
3398   c = _addcarry_u64(c, hi, T1, &T1);                    \
3399   _addcarry_u64(c, T2, 0, &T2);                         \
3400  } while(0)
3401 
3402 #endif //_WINDOWS
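
// Both flavours of MACC/MACC2 above implement a fused multiply-accumulate into
// a three-word (192-bit) accumulator, with T0 the least significant word:
//
//   MACC(A, B, T0, T1, T2):   (T2:T1:T0) +=     full 128-bit product A * B
//   MACC2(A, B, T0, T1, T2):  (T2:T1:T0) += 2 * full 128-bit product A * B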
3403 
3404 // Fast Montgomery multiplication.  The derivation of the algorithm is
3405 // in  A Cryptographic Library for the Motorola DSP56000,
3406 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
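//
// In outline, with R = 2^(64*len) and inv chosen so that inv * n[0] == -1
// (mod 2^64), the routine computes a Montgomery product m with
// m * R == a * b (mod n).  At step i the low accumulator word t0 holds the
// current least significant word of the partial result; setting
// m[i] = t0 * inv (mod 2^64) makes t0 + m[i] * n[0] == 0 (mod 2^64), so the
// bottom word cancels exactly and can be shifted out (hence the assert below
// that t0 == 0 after accumulating m[i] * n[0]).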
3407 
3408 static void NOINLINE
3409 montgomery_multiply(julong a[], julong b[], julong n[],
3410                     julong m[], julong inv, int len) {
3411   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3412   int i;
3413 
3414   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3415 
3416   for (i = 0; i < len; i++) {
3417     int j;
3418     for (j = 0; j < i; j++) {
3419       MACC(a[j], b[i-j], t0, t1, t2);
3420       MACC(m[j], n[i-j], t0, t1, t2);
3421     }
3422     MACC(a[i], b[0], t0, t1, t2);
3423     m[i] = t0 * inv;
3424     MACC(m[i], n[0], t0, t1, t2);
3425 
3426     assert(t0 == 0, "broken Montgomery multiply");
3427 
3428     t0 = t1; t1 = t2; t2 = 0;
3429   }
3430 
3431   for (i = len; i < 2*len; i++) {
3432     int j;
3433     for (j = i-len+1; j < len; j++) {
3434       MACC(a[j], b[i-j], t0, t1, t2);
3435       MACC(m[j], n[i-j], t0, t1, t2);
3436     }
3437     m[i-len] = t0;
3438     t0 = t1; t1 = t2; t2 = 0;
3439   }
3440 
3441   while (t0)
3442     t0 = sub(m, n, t0, len);
3443 }
3444 
3445 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3446 // multiplies so it should be up to 25% faster than Montgomery
3447 // multiplication.  However, its loop control is more complex and it
3448 // may actually run slower on some machines.
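//
// The saving comes from symmetry: for j != i-j the cross products a[j]*a[i-j]
// and a[i-j]*a[j] are equal, so each pair is accumulated once with MACC2
// (which adds the product twice) instead of two separate MACC calls.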
3449 
3450 static void NOINLINE
3451 montgomery_square(julong a[], julong n[],
3452                   julong m[], julong inv, int len) {
3453   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3454   int i;
3455 
3456   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3457 
3458   for (i = 0; i < len; i++) {
3459     int j;
3460     int end = (i+1)/2;
3461     for (j = 0; j < end; j++) {
3462       MACC2(a[j], a[i-j], t0, t1, t2);
3463       MACC(m[j], n[i-j], t0, t1, t2);
3464     }
3465     if ((i & 1) == 0) {
3466       MACC(a[j], a[j], t0, t1, t2);
3467     }
3468     for (; j < i; j++) {
3469       MACC(m[j], n[i-j], t0, t1, t2);
3470     }
3471     m[i] = t0 * inv;
3472     MACC(m[i], n[0], t0, t1, t2);
3473 
3474     assert(t0 == 0, "broken Montgomery square");
3475 
3476     t0 = t1; t1 = t2; t2 = 0;
3477   }
3478 
3479   for (i = len; i < 2*len; i++) {
3480     int start = i-len+1;
3481     int end = start + (len - start)/2;
3482     int j;
3483     for (j = start; j < end; j++) {
3484       MACC2(a[j], a[i-j], t0, t1, t2);
3485       MACC(m[j], n[i-j], t0, t1, t2);
3486     }
3487     if ((i & 1) == 0) {
3488       MACC(a[j], a[j], t0, t1, t2);
3489     }
3490     for (; j < len; j++) {
3491       MACC(m[j], n[i-j], t0, t1, t2);
3492     }
3493     m[i-len] = t0;
3494     t0 = t1; t1 = t2; t2 = 0;
3495   }
3496 
3497   while (t0)
3498     t0 = sub(m, n, t0, len);
3499 }
3500 
3501 // Swap words in a longword.
3502 static julong swap(julong x) {
3503   return (x << 32) | (x >> 32);
3504 }
3505 
3506 // Copy len longwords from s to d, word-swapping as we go.  The
3507 // destination array is reversed.
3508 static void reverse_words(julong *s, julong *d, int len) {
3509   d += len;
3510   while(len-- > 0) {
3511     d--;
3512     *d = swap(*s);
3513     s++;
3514   }
3515 }
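
// For example, with len == 2 and the jint sequence j0, j1, j2, j3 in memory,
// s is read as the julongs (j1:j0) and (j3:j2); after swapping the 32-bit
// halves and reversing the word order, d[0] == (j2:j3) and d[1] == (j0:j1).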
3516 
3517 // The threshold at which squaring is advantageous was determined
3518 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3519 #define MONTGOMERY_SQUARING_THRESHOLD 64
3520 
3521 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3522                                         jint len, jlong inv,
3523                                         jint *m_ints) {
3524   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3525   int longwords = len/2;
3526 
3527   // Make very sure we don't use so much space that the stack might
3528   // overflow.  512 jints correspond to a 16384-bit integer and
3529   // will use a total of 8K bytes of stack space here.
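  // Concretely: len == 512 jints gives longwords == 256, and the four scratch
  // arrays of 256 julongs each occupy 4 * 256 * 8 == 8192 bytes, which is the
  // bound enforced by the guarantee below.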
3530   int divisor = sizeof(julong) * 4;
3531   guarantee(longwords <= 8192 / divisor, "must be");
3532   int total_allocation = longwords * sizeof (julong) * 4;
3533   julong *scratch = (julong *)alloca(total_allocation);
3534 
3535   // Local scratch arrays
3536   julong
3537     *a = scratch + 0 * longwords,
3538     *b = scratch + 1 * longwords,
3539     *n = scratch + 2 * longwords,
3540     *m = scratch + 3 * longwords;
3541 
3542   reverse_words((julong *)a_ints, a, longwords);
3543   reverse_words((julong *)b_ints, b, longwords);
3544   reverse_words((julong *)n_ints, n, longwords);
3545 
3546   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3547 
3548   reverse_words(m, (julong *)m_ints, longwords);
3549 }
3550 
3551 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3552                                       jint len, jlong inv,
3553                                       jint *m_ints) {
3554   assert(len % 2 == 0, "array length in montgomery_square must be even");
3555   int longwords = len/2;
3556 
3557   // Make very sure we don't use so much space that the stack might
3558   // overflow.  512 jints correspond to a 16384-bit integer and
3559   // will use a total of 6K bytes of stack space here.
3560   int divisor = sizeof(julong) * 3;
3561   guarantee(longwords <= (8192 / divisor), "must be");
3562   int total_allocation = longwords * sizeof (julong) * 3;
3563   julong *scratch = (julong *)alloca(total_allocation);
3564 
3565   // Local scratch arrays
3566   julong
3567     *a = scratch + 0 * longwords,
3568     *n = scratch + 1 * longwords,
3569     *m = scratch + 2 * longwords;
3570 
3571   reverse_words((julong *)a_ints, a, longwords);
3572   reverse_words((julong *)n_ints, n, longwords);
3573 
3574   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3575     ::montgomery_square(a, n, m, (julong)inv, longwords);
3576   } else {
3577     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3578   }
3579 
3580   reverse_words(m, (julong *)m_ints, longwords);
3581 }
3582 
3583 #ifdef COMPILER2
3584 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3585 //
3586 //------------------------------generate_exception_blob---------------------------
3587 // Creates the exception blob at the end.
3588 // Using the exception blob, this code is jumped to from a compiled method.
3589 // (see emit_exception_handler in x86_64.ad file)
3590 //
3591 // Given an exception pc at a call, we call into the runtime for the
3592 // handler in this method. This handler might merely restore state
3593 // (i.e. callee-saved registers), unwind the frame, and jump to the
3594 // exception handler for the nmethod if there is no Java-level handler
3595 // for the nmethod.
3596 //
3597 // This code is entered with a jmp.
3598 //
3599 // Arguments:
3600 //   rax: exception oop
3601 //   rdx: exception pc
3602 //
3603 // Results:
3604 //   rax: exception oop
3605 //   rdx: exception pc in caller or ???
3606 //   destination: exception handler of caller
3607 //
3608 // Note: the exception pc MUST be at a call (precise debug information)
3609 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
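//
// In outline (illustrative): compiled code jumps here with the exception oop
// in rax and the exception pc in rdx; the blob stores both in the JavaThread,
// calls OptoRuntime::handle_exception_C to look up a handler for this frame,
// reloads the oop and pc, and jumps to the handler address returned in rax
// (which may be the deopt blob).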
3610 //
3611 
3612 void OptoRuntime::generate_exception_blob() {
3613   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3614   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3615   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3616 
3617   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3618 
3619   // Allocate space for the code
3620   ResourceMark rm;
3621   // Setup code generation tools
3622   CodeBuffer buffer("exception_blob", 2048, 1024);
3623   MacroAssembler* masm = new MacroAssembler(&buffer);
3624 
3625 
3626   address start = __ pc();
3627 
3628   // Exception pc is 'return address' for stack walker
3629   __ push(rdx);
3630   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3631 
3632   // Save callee-saved registers.  See x86_64.ad.
3633 
3634   // rbp is an implicitly saved callee-saved register (i.e., the calling
3635   // convention will save/restore it in the prolog/epilog). Other than that
3636   // there are no callee-saved registers now that adapter frames are gone.
3637 
3638   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3639 
3640   // Store exception in Thread object. We cannot pass any arguments to the
3641   // handle_exception call, since we do not want to make any assumption
3642   // about the size of the frame where the exception happened in.
3643   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3644   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3645   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3646 
3647   // This call does all the hard work.  It checks if an exception handler
3648   // exists in the method.
3649   // If so, it returns the handler address.
3650   // If not, it prepares for stack-unwinding, restoring the callee-save
3651   // registers of the frame being removed.
3652   //
3653   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3654 
3655   // At a method handle call, the stack may not be properly aligned
3656   // when returning with an exception.
3657   address the_pc = __ pc();
3658   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3659   __ mov(c_rarg0, r15_thread);
3660   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3661   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3662 
3663   // Set an oopmap for the call site.  This oopmap will only be used if we
3664   // are unwinding the stack.  Hence, all locations will be dead.
3665   // Callee-saved registers will be the same as the frame above (i.e.,
3666   // handle_exception_stub), since they were restored when we got the
3667   // exception.
3668 
3669   OopMapSet* oop_maps = new OopMapSet();
3670 
3671   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3672 
3673   __ reset_last_Java_frame(false);
3674 
3675   // Restore callee-saved registers
3676 
3677   // rbp is an implicitly saved callee-saved register (i.e., the calling
3678   // convention will save/restore it in the prolog/epilog). Other than that
3679   // there are no callee-saved registers now that adapter frames are gone.
3680 
3681   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3682 
3683   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3684   __ pop(rdx);                  // No need for exception pc anymore
3685 
3686   // rax: exception handler
3687 
3688   // We have a handler in rax (could be deopt blob).
3689   __ mov(r8, rax);
3690 
3691   // Get the exception oop
3692   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3693   // Get the exception pc in case we are deoptimized
3694   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3695 #ifdef ASSERT
3696   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3697   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3698 #endif
3699   // Clear the exception oop so GC no longer processes it as a root.
3700   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3701 
3702   // rax: exception oop
3703   // r8:  exception handler
3704   // rdx: exception pc
3705   // Jump to handler
3706 
3707   __ jmp(r8);
3708 
3709   // Make sure all code is generated
3710   masm->flush();
3711 
3712   // Set exception blob
3713   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3714 }
3715 #endif // COMPILER2
3716