1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/debugInfoRec.hpp"
  32 #include "code/icBuffer.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/compiledICHolder.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/jniHandles.hpp"
  48 #include "runtime/safepointMechanism.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/signature.hpp"
  51 #include "runtime/stubRoutines.hpp"
  52 #include "runtime/vframeArray.hpp"
  53 #include "runtime/vm_version.hpp"
  54 #include "utilities/align.hpp"
  55 #include "utilities/formatBuffer.hpp"
  56 #include "vmreg_x86.inline.hpp"
  57 #ifdef COMPILER1
  58 #include "c1/c1_Runtime1.hpp"
  59 #endif
  60 #ifdef COMPILER2
  61 #include "opto/runtime.hpp"
  62 #endif
  63 #if INCLUDE_JVMCI
  64 #include "jvmci/jvmciJavaClasses.hpp"
  65 #endif
  66 
  67 #define __ masm->
  68 
  69 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  70 
  71 class SimpleRuntimeFrame {
  72 
  73   public:
  74 
  75   // Most of the runtime stubs have this simple frame layout.
  76   // This class exists to make the layout shared in one place.
  77   // Offsets are for compiler stack slots, which are jints.
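  //
  // Rough sketch of the frame this layout describes (higher addresses at the
  // top, offsets counted in jint slots):
  //
  //   [ return address     ]  return_off / return_off2
  //   [ saved rbp          ]  rbp_off / rbp_off2
  //   [ arg reg save area  ]  frame::arg_reg_save_area_bytes, if any
  //   <-- rsp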
  78   enum layout {
  79     // The frame sender code expects that rbp will be in the "natural" place and
  80     // will override any oopMap setting for it. We must therefore force the layout
  81     // so that it agrees with the frame sender code.
  82     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  83     rbp_off2,
  84     return_off, return_off2,
  85     framesize
  86   };
  87 };
  88 
  89 class RegisterSaver {
  90   // Capture info about frame layout.  Layout offsets are in jint
  91   // units because compiler frame slots are jints.
  92 #define XSAVE_AREA_BEGIN 160
  93 #define XSAVE_AREA_YMM_BEGIN 576
  94 #define XSAVE_AREA_OPMASK_BEGIN 1088
  95 #define XSAVE_AREA_ZMM_BEGIN 1152
  96 #define XSAVE_AREA_UPPERBANK 1664
  97 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  98 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  99 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 100 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 101 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 102   enum layout {
 103     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 104     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 105     DEF_XMM_OFFS(0),
 106     DEF_XMM_OFFS(1),
 107     // 2..15 are implied in range usage
 108     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 109     DEF_YMM_OFFS(0),
 110     DEF_YMM_OFFS(1),
 111     // 2..15 are implied in range usage
 112     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 113     DEF_OPMASK_OFFS(0),
 114     DEF_OPMASK_OFFS(1),
 115     // 2..7 are implied in range usage
 116     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 117     DEF_ZMM_OFFS(0),
 118     DEF_ZMM_OFFS(1),
 119     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 120     DEF_ZMM_UPPER_OFFS(16),
 121     DEF_ZMM_UPPER_OFFS(17),
 122     // 18..31 are implied in range usage
 123     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 124     fpu_stateH_end,
 125     r15_off, r15H_off,
 126     r14_off, r14H_off,
 127     r13_off, r13H_off,
 128     r12_off, r12H_off,
 129     r11_off, r11H_off,
 130     r10_off, r10H_off,
 131     r9_off,  r9H_off,
 132     r8_off,  r8H_off,
 133     rdi_off, rdiH_off,
 134     rsi_off, rsiH_off,
 135     ignore_off, ignoreH_off,  // extra copy of rbp
 136     rsp_off, rspH_off,
 137     rbx_off, rbxH_off,
 138     rdx_off, rdxH_off,
 139     rcx_off, rcxH_off,
 140     rax_off, raxH_off,
 141     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 142     align_off, alignH_off,
 143     flags_off, flagsH_off,
 144     // The frame sender code expects that rbp will be in the "natural" place and
 145     // will override any oopMap setting for it. We must therefore force the layout
 146     // so that it agrees with the frame sender code.
 147     rbp_off, rbpH_off,        // copy of rbp we will restore
 148     return_off, returnH_off,  // slot for return address
 149     reg_save_size             // size in compiler stack slots
 150   };
 151 
 152  public:
 153   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors);
 154   static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 155 
 156   // Offsets into the register save area
 157   // Used by deoptimization when it is managing result register
 158   // values on its own
 159 
 160   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 161   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 162   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 163   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 164   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 165 
 166   // During deoptimization only the result registers need to be restored,
 167   // all the other values have already been extracted.
 168   static void restore_result_registers(MacroAssembler* masm);
 169 };
 170 
// Register is a class, but it is assigned a numerical value; "0" is assigned
// for rax, so we need to ignore -Wnonnull.
 173 PRAGMA_DIAG_PUSH
 174 PRAGMA_NONNULL_IGNORED
 175 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
 176   int off = 0;
 177   int num_xmm_regs = XMMRegisterImpl::available_xmm_registers();
 178 #if COMPILER2_OR_JVMCI
 179   if (save_vectors && UseAVX == 0) {
 180     save_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 181   }
 182   assert(!save_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 183 #else
 184   save_vectors = false; // vectors are generated only by C2 and JVMCI
 185 #endif
 186 
  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated.
 188   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 189   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 190   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 191   // CodeBlob frame size is in words.
 192   int frame_size_in_words = frame_size_in_bytes / wordSize;
 193   *total_frame_words = frame_size_in_words;
 194 
 195   // Save registers, fpu state, and flags.
 196   // We assume caller has already pushed the return address onto the
 197   // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, just like a normal enter would leave it.
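  //
  // Rough sketch of the save area built here, matching the layout enum above
  // (higher addresses at the top):
  //
  //   [ return address            ]  return_off
  //   [ saved rbp                 ]  rbp_off    (from enter())
  //   [ flags .. rax .. r15       ]  pushed by push_CPU_state
  //   [ FXSAVE/XSAVE state        ]  fpu_state_off
  //   [ arg reg save area         ]  subtracted below, if any  <-- rsp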
 200 
 201   __ enter();          // rsp becomes 16-byte aligned here
 202   __ push_CPU_state(); // Push a multiple of 16 bytes
 203 
  // push_CPU_state handles this on EVEX-enabled targets
 205   if (save_vectors) {
 206     // Save upper half of YMM registers(0..15)
 207     int base_addr = XSAVE_AREA_YMM_BEGIN;
 208     for (int n = 0; n < 16; n++) {
 209       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 210     }
 211     if (VM_Version::supports_evex()) {
 212       // Save upper half of ZMM registers(0..15)
 213       base_addr = XSAVE_AREA_ZMM_BEGIN;
 214       for (int n = 0; n < 16; n++) {
 215         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 216       }
 217       // Save full ZMM registers(16..num_xmm_regs)
 218       base_addr = XSAVE_AREA_UPPERBANK;
 219       off = 0;
 220       int vector_len = Assembler::AVX_512bit;
 221       for (int n = 16; n < num_xmm_regs; n++) {
 222         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 223       }
 224 #if COMPILER2_OR_JVMCI
 225       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 226       off = 0;
 227       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 228         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 229       }
 230 #endif
 231     }
 232   } else {
 233     if (VM_Version::supports_evex()) {
 234       // Save upper bank of ZMM registers(16..31) for double/float usage
 235       int base_addr = XSAVE_AREA_UPPERBANK;
 236       off = 0;
 237       for (int n = 16; n < num_xmm_regs; n++) {
 238         __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
 239       }
 240 #if COMPILER2_OR_JVMCI
 241       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 242       off = 0;
 243       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 244         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 245       }
 246 #endif
 247     }
 248   }
 249   __ vzeroupper();
 250   if (frame::arg_reg_save_area_bytes != 0) {
 251     // Allocate argument register save area
 252     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 253   }
 254 
 255   // Set an oopmap for the call site.  This oopmap will map all
 256   // oop-registers and debug-info registers as callee-saved.  This
 257   // will allow deoptimization at this safepoint to find all possible
 258   // debug-info recordings, as well as let GC find all oops.
 259 
 260   OopMapSet *oop_maps = new OopMapSet();
 261   OopMap* map = new OopMap(frame_size_in_slots, 0);
 262 
 263 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 264 
 265   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 266   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 267   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 269   // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
 271   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 272   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 273   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX-enabled targets it is included in the xsave area.
 283   off = xmm0_off;
 284   int delta = xmm1_off - off;
 285   for (int n = 0; n < 16; n++) {
 286     XMMRegister xmm_name = as_XMMRegister(n);
 287     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 288     off += delta;
 289   }
 290   if (UseAVX > 2) {
 291     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 292     off = zmm16_off;
 293     delta = zmm17_off - off;
 294     for (int n = 16; n < num_xmm_regs; n++) {
 295       XMMRegister zmm_name = as_XMMRegister(n);
 296       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 297       off += delta;
 298     }
 299   }
 300 
 301 #if COMPILER2_OR_JVMCI
 302   if (save_vectors) {
 303     // Save upper half of YMM registers(0..15)
 304     off = ymm0_off;
 305     delta = ymm1_off - ymm0_off;
 306     for (int n = 0; n < 16; n++) {
 307       XMMRegister ymm_name = as_XMMRegister(n);
 308       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 309       off += delta;
 310     }
 311     if (VM_Version::supports_evex()) {
 312       // Save upper half of ZMM registers(0..15)
 313       off = zmm0_off;
 314       delta = zmm1_off - zmm0_off;
 315       for (int n = 0; n < 16; n++) {
 316         XMMRegister zmm_name = as_XMMRegister(n);
 317         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 318         off += delta;
 319       }
 320     }
 321   }
 322 #endif // COMPILER2_OR_JVMCI
 323 
 324   // %%% These should all be a waste but we'll keep things as they were for now
 325   if (true) {
 326     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 327     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 328     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 330     // rbp location is known implicitly by the frame sender code, needs no oopmap
 331     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 333     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX-enabled targets it is included in the xsave area.
 343     off = xmm0H_off;
 344     delta = xmm1H_off - off;
 345     for (int n = 0; n < 16; n++) {
 346       XMMRegister xmm_name = as_XMMRegister(n);
 347       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 348       off += delta;
 349     }
 350     if (UseAVX > 2) {
 351       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 352       off = zmm16H_off;
 353       delta = zmm17H_off - off;
 354       for (int n = 16; n < num_xmm_regs; n++) {
 355         XMMRegister zmm_name = as_XMMRegister(n);
 356         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 357         off += delta;
 358       }
 359     }
 360   }
 361 
 362   return map;
 363 }
 364 PRAGMA_DIAG_POP
 365 
 366 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
 367   int num_xmm_regs = XMMRegisterImpl::available_xmm_registers();
 368   if (frame::arg_reg_save_area_bytes != 0) {
 369     // Pop arg register save area
 370     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 371   }
 372 
 373 #if COMPILER2_OR_JVMCI
 374   if (restore_vectors) {
 375     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 376     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 377   }
 378 #else
 379   assert(!restore_vectors, "vectors are generated only by C2");
 380 #endif
 381 
 382   __ vzeroupper();
 383 
  // On EVEX-enabled targets everything is handled by pop_CPU_state
 385   if (restore_vectors) {
 386     // Restore upper half of YMM registers (0..15)
 387     int base_addr = XSAVE_AREA_YMM_BEGIN;
 388     for (int n = 0; n < 16; n++) {
 389       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 390     }
 391     if (VM_Version::supports_evex()) {
 392       // Restore upper half of ZMM registers (0..15)
 393       base_addr = XSAVE_AREA_ZMM_BEGIN;
 394       for (int n = 0; n < 16; n++) {
 395         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 396       }
 397       // Restore full ZMM registers(16..num_xmm_regs)
 398       base_addr = XSAVE_AREA_UPPERBANK;
 399       int vector_len = Assembler::AVX_512bit;
 400       int off = 0;
 401       for (int n = 16; n < num_xmm_regs; n++) {
 402         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 403       }
 404 #if COMPILER2_OR_JVMCI
 405       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 406       off = 0;
 407       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 408         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 409       }
 410 #endif
 411     }
 412   } else {
 413     if (VM_Version::supports_evex()) {
 414       // Restore upper bank of ZMM registers(16..31) for double/float usage
 415       int base_addr = XSAVE_AREA_UPPERBANK;
 416       int off = 0;
 417       for (int n = 16; n < num_xmm_regs; n++) {
 418         __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
 419       }
 420 #if COMPILER2_OR_JVMCI
 421       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 422       off = 0;
 423       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 424         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 425       }
 426 #endif
 427     }
 428   }
 429 
 430   // Recover CPU state
 431   __ pop_CPU_state();
 432   // Get the rbp described implicitly by the calling convention (no oopMap)
 433   __ pop(rbp);
 434 }
 435 
 436 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 437 
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee-save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only the result registers need to be restored here.
 443 
 444   // Restore fp result register
 445   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 446   // Restore integer result register
 447   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 448   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 449 
  // Pop all of the register save area off the stack except the return address
 451   __ addptr(rsp, return_offset_in_bytes());
 452 }
 453 
// Is a vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 456 bool SharedRuntime::is_wide_vector(int size) {
 457   return size > 16;
 458 }
 459 
 460 // ---------------------------------------------------------------------------
 461 // Read the array of BasicTypes from a signature, and compute where the
 462 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 463 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 464 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 465 // as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot at 0(sp), and VMRegImpl::stack0+1
// refers to the memory word 4 bytes higher.  Register values up to
// RegisterImpl::number_of_registers are the 64-bit integer registers.
 470 
 471 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build.  Of course for i486 there is no 64-bit build.
 474 
 475 // The Java calling convention is a "shifted" version of the C ABI.
 476 // By skipping the first C ABI register we can call non-static jni methods
 477 // with small numbers of arguments without having to shuffle the arguments
 478 // at all. Since we control the java ABI we ought to at least get some
 479 // advantage out of it.
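
// For illustration, the mapping the code below produces for the signature
// (int, long, float, double, Object), using the j_rarg*/j_farg* aliases:
//
//   sig_bt[] = { T_INT, T_LONG, T_VOID, T_FLOAT, T_DOUBLE, T_VOID, T_OBJECT }
//   regs[]   =  j_rarg0 j_rarg1  bad    j_farg0  j_farg1    bad    j_rarg2
//
// With more than Argument::n_int_register_parameters_j (6) int/oop args, the
// seventh one falls back to VMRegImpl::stack2reg(stk_args) and stk_args
// advances by 2.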
 480 
 481 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 482                                            VMRegPair *regs,
 483                                            int total_args_passed) {
 484 
 485   // Create the mapping between argument positions and
 486   // registers.
 487   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 488     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 489   };
 490   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 491     j_farg0, j_farg1, j_farg2, j_farg3,
 492     j_farg4, j_farg5, j_farg6, j_farg7
 493   };
 494 
 495 
 496   uint int_args = 0;
 497   uint fp_args = 0;
 498   uint stk_args = 0; // inc by 2 each time
 499 
 500   for (int i = 0; i < total_args_passed; i++) {
 501     switch (sig_bt[i]) {
 502     case T_BOOLEAN:
 503     case T_CHAR:
 504     case T_BYTE:
 505     case T_SHORT:
 506     case T_INT:
 507       if (int_args < Argument::n_int_register_parameters_j) {
 508         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 509       } else {
 510         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 511         stk_args += 2;
 512       }
 513       break;
 514     case T_VOID:
 515       // halves of T_LONG or T_DOUBLE
 516       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 517       regs[i].set_bad();
 518       break;
 519     case T_LONG:
 520       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 521       // fall through
 522     case T_OBJECT:
 523     case T_ARRAY:
 524     case T_ADDRESS:
 525       if (int_args < Argument::n_int_register_parameters_j) {
 526         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 527       } else {
 528         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 529         stk_args += 2;
 530       }
 531       break;
 532     case T_FLOAT:
 533       if (fp_args < Argument::n_float_register_parameters_j) {
 534         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 535       } else {
 536         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 537         stk_args += 2;
 538       }
 539       break;
 540     case T_DOUBLE:
 541       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 542       if (fp_args < Argument::n_float_register_parameters_j) {
 543         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 544       } else {
 545         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 546         stk_args += 2;
 547       }
 548       break;
 549     default:
 550       ShouldNotReachHere();
 551       break;
 552     }
 553   }
 554 
 555   return align_up(stk_args, 2);
 556 }
 557 
// Patch the caller's callsite with the entry to compiled code if it exists.
 559 static void patch_callers_callsite(MacroAssembler *masm) {
 560   Label L;
 561   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
 562   __ jcc(Assembler::equal, L);
 563 
 564   // Save the current stack pointer
 565   __ mov(r13, rsp);
 566   // Schedule the branch target address early.
 567   // Call into the VM to patch the caller, then jump to compiled callee
 568   // rax isn't live so capture return address while we easily can
 569   __ movptr(rax, Address(rsp, 0));
 570 
 571   // align stack so push_CPU_state doesn't fault
 572   __ andptr(rsp, -(StackAlignmentInBytes));
 573   __ push_CPU_state();
 574   __ vzeroupper();
 575   // VM needs caller's callsite
 576   // VM needs target method
 577   // This needs to be a long call since we will relocate this adapter to
 578   // the codeBuffer and it may not reach
 579 
 580   // Allocate argument register save area
 581   if (frame::arg_reg_save_area_bytes != 0) {
 582     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 583   }
 584   __ mov(c_rarg0, rbx);
 585   __ mov(c_rarg1, rax);
 586   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 587 
 588   // De-allocate argument register save area
 589   if (frame::arg_reg_save_area_bytes != 0) {
 590     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 591   }
 592 
 593   __ vzeroupper();
 594   __ pop_CPU_state();
 595   // restore sp
 596   __ mov(rsp, r13);
 597   __ bind(L);
 598 }
 599 
 600 
 601 static void gen_c2i_adapter(MacroAssembler *masm,
 602                             int total_args_passed,
 603                             int comp_args_on_stack,
 604                             const BasicType *sig_bt,
 605                             const VMRegPair *regs,
 606                             Label& skip_fixup) {
 607   // Before we get into the guts of the C2I adapter, see if we should be here
 608   // at all.  We've come from compiled code and are attempting to jump to the
 609   // interpreter, which means the caller made a static call to get here
 610   // (vcalls always get a compiled target if there is one).  Check for a
 611   // compiled target.  If there is one, we need to patch the caller's call.
 612   patch_callers_callsite(masm);
 613 
 614   __ bind(skip_fixup);
 615 
  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.  Add one word because
  // we also account for the return address location, since we store it
  // first rather than holding it in rax across all the shuffling.
 620 
 621   int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
 622 
 623   // stack is aligned, keep it that way
 624   extraspace = align_up(extraspace, 2*wordSize);
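
  // For example (a sketch, assuming the 8-byte Interpreter::stackElementSize
  // of this 64-bit port): total_args_passed == 4 gives 4*8 + 8 = 40 bytes,
  // rounded up to 48 so the stack stays 16-byte aligned.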
 625 
 626   // Get return address
 627   __ pop(rax);
 628 
 629   // set senderSP value
 630   __ mov(r13, rsp);
 631 
 632   __ subptr(rsp, extraspace);
 633 
 634   // Store the return address in the expected location
 635   __ movptr(Address(rsp, 0), rax);
 636 
 637   // Now write the args into the outgoing interpreter space
 638   for (int i = 0; i < total_args_passed; i++) {
 639     if (sig_bt[i] == T_VOID) {
 640       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 641       continue;
 642     }
 643 
 644     // offset to start parameters
 645     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 646     int next_off = st_off - Interpreter::stackElementSize;
 647 
 648     // Say 4 args:
 649     // i   st_off
 650     // 0   32 T_LONG
 651     // 1   24 T_VOID
 652     // 2   16 T_OBJECT
 653     // 3    8 T_BOOL
 654     // -    0 return address
 655     //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
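    //
    // For the T_LONG at i == 0 in the example above: st_off == 32 and
    // next_off == 24; the 64-bit value is written once at next_off (the slot
    // labelled T_VOID), and in debug builds st_off is filled with junk.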
 660 
 661     VMReg r_1 = regs[i].first();
 662     VMReg r_2 = regs[i].second();
 663     if (!r_1->is_valid()) {
 664       assert(!r_2->is_valid(), "");
 665       continue;
 666     }
 667     if (r_1->is_stack()) {
 668       // memory to memory use rax
 669       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 670       if (!r_2->is_valid()) {
 671         // sign extend??
 672         __ movl(rax, Address(rsp, ld_off));
 673         __ movptr(Address(rsp, st_off), rax);
 674 
 675       } else {
 676 
 677         __ movq(rax, Address(rsp, ld_off));
 678 
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 680         // T_DOUBLE and T_LONG use two slots in the interpreter
 681         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 682           // ld_off == LSW, ld_off+wordSize == MSW
 683           // st_off == MSW, next_off == LSW
 684           __ movq(Address(rsp, next_off), rax);
 685 #ifdef ASSERT
 686           // Overwrite the unused slot with known junk
 687           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 688           __ movptr(Address(rsp, st_off), rax);
 689 #endif /* ASSERT */
 690         } else {
 691           __ movq(Address(rsp, st_off), rax);
 692         }
 693       }
 694     } else if (r_1->is_Register()) {
 695       Register r = r_1->as_Register();
 696       if (!r_2->is_valid()) {
        // must be only an int (or smaller) so move only 32 bits to the slot
        // why not sign extend??
 699         __ movl(Address(rsp, st_off), r);
 700       } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 702         // T_DOUBLE and T_LONG use two slots in the interpreter
 703         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 704           // long/double in gpr
 705 #ifdef ASSERT
 706           // Overwrite the unused slot with known junk
 707           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 708           __ movptr(Address(rsp, st_off), rax);
 709 #endif /* ASSERT */
 710           __ movq(Address(rsp, next_off), r);
 711         } else {
 712           __ movptr(Address(rsp, st_off), r);
 713         }
 714       }
 715     } else {
 716       assert(r_1->is_XMMRegister(), "");
 717       if (!r_2->is_valid()) {
        // only a float, use just part of the slot
 719         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 720       } else {
 721 #ifdef ASSERT
 722         // Overwrite the unused slot with known junk
 723         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 724         __ movptr(Address(rsp, st_off), rax);
 725 #endif /* ASSERT */
 726         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 727       }
 728     }
 729   }
 730 
 731   // Schedule the branch target address early.
 732   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 733   __ jmp(rcx);
 734 }
 735 
 736 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 737                         address code_start, address code_end,
 738                         Label& L_ok) {
 739   Label L_fail;
 740   __ lea(temp_reg, ExternalAddress(code_start));
 741   __ cmpptr(pc_reg, temp_reg);
 742   __ jcc(Assembler::belowEqual, L_fail);
 743   __ lea(temp_reg, ExternalAddress(code_end));
 744   __ cmpptr(pc_reg, temp_reg);
 745   __ jcc(Assembler::below, L_ok);
 746   __ bind(L_fail);
 747 }
 748 
 749 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 750                                     int total_args_passed,
 751                                     int comp_args_on_stack,
 752                                     const BasicType *sig_bt,
 753                                     const VMRegPair *regs) {
 754 
  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get the args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment expected in all compiled code, and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
 763 
 764   // Adapters can be frameless because they do not require the caller
 765   // to perform additional cleanup work, such as correcting the stack pointer.
 766   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 767   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 768   // even if a callee has modified the stack pointer.
 769   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 770   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 771   // up via the senderSP register).
 772   // In other words, if *either* the caller or callee is interpreted, we can
 773   // get the stack pointer repaired after a call.
 774   // This is why c2i and i2c adapters cannot be indefinitely composed.
 775   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 776   // both caller and callee would be compiled methods, and neither would
 777   // clean up the stack pointer changes performed by the two adapters.
 778   // If this happens, control eventually transfers back to the compiled
 779   // caller, but with an uncorrected stack, causing delayed havoc.
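  //
  // For example: compiled -> i2c -> interpreted works because the interpreted
  // callee repairs its caller's SP (via senderSP), and interpreted -> c2i ->
  // compiled works because the interpreted caller repairs its own SP; but a
  // hypothetical compiled -> c2i -> i2c -> compiled chain would leave both
  // adapters' SP adjustments unrepaired.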
 780 
 781   // Pick up the return address
 782   __ movptr(rax, Address(rsp, 0));
 783 
 784   if (VerifyAdapterCalls &&
 785       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
 786     // So, let's test for cascading c2i/i2c adapters right now.
 787     //  assert(Interpreter::contains($return_addr) ||
 788     //         StubRoutines::contains($return_addr),
 789     //         "i2c adapter must return to an interpreter frame");
 790     __ block_comment("verify_i2c { ");
 791     Label L_ok;
 792     if (Interpreter::code() != NULL)
 793       range_check(masm, rax, r11,
 794                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
 795                   L_ok);
 796     if (StubRoutines::code1() != NULL)
 797       range_check(masm, rax, r11,
 798                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
 799                   L_ok);
 800     if (StubRoutines::code2() != NULL)
 801       range_check(masm, rax, r11,
 802                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
 803                   L_ok);
 804     const char* msg = "i2c adapter must return to an interpreter frame";
 805     __ block_comment(msg);
 806     __ stop(msg);
 807     __ bind(L_ok);
    __ block_comment("} verify_i2c ");
 809   }
 810 
 811   // Must preserve original SP for loading incoming arguments because
 812   // we need to align the outgoing SP for compiled code.
 813   __ movptr(r11, rsp);
 814 
  // Cut-out for having no stack args.  Since up to 6 int/oop args are passed
  // in registers, we will occasionally have no stack args.
 817   int comp_words_on_stack = 0;
 818   if (comp_args_on_stack) {
    // Sig words on the stack are greater than VMRegImpl::stack0.  Those in
 820     // registers are below.  By subtracting stack0, we either get a negative
 821     // number (all values in registers) or the maximum stack slot accessed.
 822 
 823     // Convert 4-byte c2 stack slots to words.
 824     comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in words
 826     comp_words_on_stack = align_up(comp_words_on_stack, 2);
 827     __ subptr(rsp, comp_words_on_stack * wordSize);
 828   }
 829 
 830 
 831   // Ensure compiled code always sees stack at proper alignment
 832   __ andptr(rsp, -16);
 833 
  // Push the return address and misalign the stack so that the youngest frame
  // sees the layout it expects with respect to the placement of the call instruction.
 836   __ push(rax);
 837 
 838   // Put saved SP in another register
 839   const Register saved_sp = rax;
 840   __ movptr(saved_sp, r11);
 841 
 842   // Will jump to the compiled code just as if compiled code was doing it.
 843   // Pre-load the register-jump target early, to schedule it better.
 844   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 845 
 846 #if INCLUDE_JVMCI
 847   if (EnableJVMCI) {
 848     // check if this call should be routed towards a specific entry point
 849     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 850     Label no_alternative_target;
 851     __ jcc(Assembler::equal, no_alternative_target);
 852     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 853     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 854     __ bind(no_alternative_target);
 855   }
 856 #endif // INCLUDE_JVMCI
 857 
 858   // Now generate the shuffle code.  Pick up all register args and move the
 859   // rest through the floating point stack top.
 860   for (int i = 0; i < total_args_passed; i++) {
 861     if (sig_bt[i] == T_VOID) {
 862       // Longs and doubles are passed in native word order, but misaligned
 863       // in the 32-bit build.
 864       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 865       continue;
 866     }
 867 
 868     // Pick up 0, 1 or 2 words from SP+offset.
 869 
 870     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 871             "scrambled load targets?");
 872     // Load in argument order going down.
 873     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 874     // Point to interpreter value (vs. tag)
 875     int next_off = ld_off - Interpreter::stackElementSize;
 876     //
 877     //
 878     //
 879     VMReg r_1 = regs[i].first();
 880     VMReg r_2 = regs[i].second();
 881     if (!r_1->is_valid()) {
 882       assert(!r_2->is_valid(), "");
 883       continue;
 884     }
 885     if (r_1->is_stack()) {
 886       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 887       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 888 
      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going through a c2i because of a miss, a reasonable value of r13
      // will be generated.
 892       if (!r_2->is_valid()) {
 893         // sign extend???
 894         __ movl(r13, Address(saved_sp, ld_off));
 895         __ movptr(Address(rsp, st_off), r13);
 896       } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed at negative offsets so the LSW is at the lower address.
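        //
        // Illustration: for a long/double the interpreter stored the 64-bit
        // value in the lower-addressed of its two slots, so we read it from
        // next_off (== ld_off - Interpreter::stackElementSize) rather than ld_off.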
 904 
 905         // ld_off is MSW so get LSW
 906         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 907                            next_off : ld_off;
 908         __ movq(r13, Address(saved_sp, offset));
 909         // st_off is LSW (i.e. reg.first())
 910         __ movq(Address(rsp, st_off), r13);
 911       }
 912     } else if (r_1->is_Register()) {  // Register argument
 913       Register r = r_1->as_Register();
 914       assert(r != rax, "must be different");
 915       if (r_2->is_valid()) {
 916         //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
 920 
 921         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 922                            next_off : ld_off;
 923 
 924         // this can be a misaligned move
 925         __ movq(r, Address(saved_sp, offset));
 926       } else {
 927         // sign extend and use a full word?
 928         __ movl(r, Address(saved_sp, ld_off));
 929       }
 930     } else {
 931       if (!r_2->is_valid()) {
 932         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 933       } else {
 934         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 935       }
 936     }
 937   }
 938 
  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race through here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately, if
  // we try to find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
 948 
 949   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 950 
  // Put the Method* where a c2i would expect it should we end up there.
  // Only needed because the c2 resolve stubs return the Method* as a result in
  // rax.
 954   __ mov(rax, rbx);
 955   __ jmp(r11);
 956 }
 957 
 958 // ---------------------------------------------------------------
 959 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 960                                                             int total_args_passed,
 961                                                             int comp_args_on_stack,
 962                                                             const BasicType *sig_bt,
 963                                                             const VMRegPair *regs,
 964                                                             AdapterFingerPrint* fingerprint) {
 965   address i2c_entry = __ pc();
 966 
 967   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 968 
 969   // -------------------------------------------------------------------------
 970   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 971   // to the interpreter.  The args start out packed in the compiled layout.  They
 972   // need to be unpacked into the interpreter layout.  This will almost always
 973   // require some stack space.  We grow the current (compiled) stack, then repack
 974   // the args.  We  finally end in a jump to the generic interpreter entry point.
 975   // On exit from the interpreter, the interpreter will restore our SP (lest the
 976   // compiled code, which relies solely on SP and not RBP, get sick).
 977 
 978   address c2i_unverified_entry = __ pc();
 979   Label skip_fixup;
 980   Label ok;
 981 
 982   Register holder = rax;
 983   Register receiver = j_rarg0;
 984   Register temp = rbx;
 985 
 986   {
 987     __ load_klass(temp, receiver, rscratch1);
 988     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
 989     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
 990     __ jcc(Assembler::equal, ok);
 991     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 992 
 993     __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
 997     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
 998     __ jcc(Assembler::equal, skip_fixup);
 999     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1000   }
1001 
1002   address c2i_entry = __ pc();
1003 
1004   // Class initialization barrier for static methods
1005   address c2i_no_clinit_check_entry = NULL;
1006   if (VM_Version::supports_fast_class_init_checks()) {
1007     Label L_skip_barrier;
1008     Register method = rbx;
1009 
1010     { // Bypass the barrier for non-static methods
1011       Register flags  = rscratch1;
1012       __ movl(flags, Address(method, Method::access_flags_offset()));
1013       __ testl(flags, JVM_ACC_STATIC);
1014       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1015     }
1016 
1017     Register klass = rscratch1;
1018     __ load_method_holder(klass, method);
1019     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1020 
1021     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1022 
1023     __ bind(L_skip_barrier);
1024     c2i_no_clinit_check_entry = __ pc();
1025   }
1026 
1027   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1028   bs->c2i_entry_barrier(masm);
1029 
1030   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1031 
1032   __ flush();
1033   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1034 }
1035 
1036 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1037                                          VMRegPair *regs,
1038                                          VMRegPair *regs2,
1039                                          int total_args_passed) {
1040   assert(regs2 == NULL, "not needed on x86");
1041 // We return the amount of VMRegImpl stack slots we need to reserve for all
1042 // the arguments NOT counting out_preserve_stack_slots.
1043 
1044 // NOTE: These arrays will have to change when c1 is ported
1045 #ifdef _WIN64
1046     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1047       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1048     };
1049     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1050       c_farg0, c_farg1, c_farg2, c_farg3
1051     };
1052 #else
1053     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1054       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1055     };
1056     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1057       c_farg0, c_farg1, c_farg2, c_farg3,
1058       c_farg4, c_farg5, c_farg6, c_farg7
1059     };
1060 #endif // _WIN64
1061 
1062 
1063     uint int_args = 0;
1064     uint fp_args = 0;
1065     uint stk_args = 0; // inc by 2 each time
1066 
1067     for (int i = 0; i < total_args_passed; i++) {
1068       switch (sig_bt[i]) {
1069       case T_BOOLEAN:
1070       case T_CHAR:
1071       case T_BYTE:
1072       case T_SHORT:
1073       case T_INT:
1074         if (int_args < Argument::n_int_register_parameters_c) {
1075           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1076 #ifdef _WIN64
1077           fp_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1079           stk_args += 2;
1080 #endif
1081         } else {
1082           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1083           stk_args += 2;
1084         }
1085         break;
1086       case T_LONG:
1087         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1088         // fall through
1089       case T_OBJECT:
1090       case T_ARRAY:
1091       case T_ADDRESS:
1092       case T_METADATA:
1093         if (int_args < Argument::n_int_register_parameters_c) {
1094           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1095 #ifdef _WIN64
1096           fp_args++;
1097           stk_args += 2;
1098 #endif
1099         } else {
1100           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1101           stk_args += 2;
1102         }
1103         break;
1104       case T_FLOAT:
1105         if (fp_args < Argument::n_float_register_parameters_c) {
1106           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1107 #ifdef _WIN64
1108           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1110           stk_args += 2;
1111 #endif
1112         } else {
1113           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1114           stk_args += 2;
1115         }
1116         break;
1117       case T_DOUBLE:
1118         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1119         if (fp_args < Argument::n_float_register_parameters_c) {
1120           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1121 #ifdef _WIN64
1122           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1124           stk_args += 2;
1125 #endif
1126         } else {
1127           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1128           stk_args += 2;
1129         }
1130         break;
1131       case T_VOID: // Halves of longs and doubles
1132         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1133         regs[i].set_bad();
1134         break;
1135       default:
1136         ShouldNotReachHere();
1137         break;
1138       }
1139     }
1140 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for four 64-bit registers to be stored down.
1143   if (stk_args < 8) {
1144     stk_args = 8;
1145   }
1146 #endif // _WIN64
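
  // For illustration, a sketch of what the code above computes for the
  // signature (int, double, long):
  //   Win64:  int -> c_rarg0, double -> c_farg1, long -> c_rarg2 (register
  //           positions are shared across int and fp args); stk_args ends up
  //           at 6 and is then bumped to the 8-slot minimum above.
  //   SysV:   int -> c_rarg0, double -> c_farg0, long -> c_rarg1, and the
  //           function returns 0 stack slots.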
1147 
1148   return stk_args;
1149 }
1150 
1151 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1152                                              uint num_bits,
1153                                              uint total_args_passed) {
1154   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1155          "only certain vector sizes are supported for now");
1156 
1157   static const XMMRegister VEC_ArgReg[32] = {
1158      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1159      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1160     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1161     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1162   };
1163 
1164   uint stk_args = 0;
1165   uint fp_args = 0;
1166 
1167   for (uint i = 0; i < total_args_passed; i++) {
1168     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1169     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1170     regs[i].set_pair(vmreg->next(next_val), vmreg);
1171   }
1172 
1173   return stk_args;
1174 }
1175 
1176 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1179   switch (ret_type) {
1180   case T_FLOAT:
1181     __ movflt(Address(rbp, -wordSize), xmm0);
1182     break;
1183   case T_DOUBLE:
1184     __ movdbl(Address(rbp, -wordSize), xmm0);
1185     break;
1186   case T_VOID:  break;
1187   default: {
1188     __ movptr(Address(rbp, -wordSize), rax);
1189     }
1190   }
1191 }
1192 
1193 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1196   switch (ret_type) {
1197   case T_FLOAT:
1198     __ movflt(xmm0, Address(rbp, -wordSize));
1199     break;
1200   case T_DOUBLE:
1201     __ movdbl(xmm0, Address(rbp, -wordSize));
1202     break;
1203   case T_VOID:  break;
1204   default: {
1205     __ movptr(rax, Address(rbp, -wordSize));
1206     }
1207   }
1208 }
1209 
1210 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1211     for ( int i = first_arg ; i < arg_count ; i++ ) {
1212       if (args[i].first()->is_Register()) {
1213         __ push(args[i].first()->as_Register());
1214       } else if (args[i].first()->is_XMMRegister()) {
1215         __ subptr(rsp, 2*wordSize);
1216         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1217       }
1218     }
1219 }
1220 
1221 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1222     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1223       if (args[i].first()->is_Register()) {
1224         __ pop(args[i].first()->as_Register());
1225       } else if (args[i].first()->is_XMMRegister()) {
1226         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1227         __ addptr(rsp, 2*wordSize);
1228       }
1229     }
1230 }
1231 
1232 // Different signatures may require very different orders for the move
1233 // to avoid clobbering other arguments.  There's no simple way to
1234 // order them safely.  Compute a safe order for issuing stores and
1235 // break any cycles in those stores.  This code is fairly general but
1236 // it's not necessary on the other platforms so we keep it in the
1237 // platform dependent code instead of moving it into a shared file.
1238 // (See bugs 7013347 & 7145024.)
1239 // Note that this code is specific to LP64.
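//
// For example, if one incoming arg must move rdi -> rsi while another must
// move rsi -> rdi, the two stores form a cycle; break_cycle() below redirects
// one store into the temp register and appends a final temp -> original
// destination store, yielding a sequence like rdi -> temp, rsi -> rdi, temp -> rsi.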
1240 class ComputeMoveOrder: public StackObj {
1241   class MoveOperation: public ResourceObj {
1242     friend class ComputeMoveOrder;
1243    private:
1244     VMRegPair        _src;
1245     VMRegPair        _dst;
1246     int              _src_index;
1247     int              _dst_index;
1248     bool             _processed;
1249     MoveOperation*  _next;
1250     MoveOperation*  _prev;
1251 
1252     static int get_id(VMRegPair r) {
1253       return r.first()->value();
1254     }
1255 
1256    public:
1257     MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1258       _src(src)
1259     , _dst(dst)
1260     , _src_index(src_index)
1261     , _dst_index(dst_index)
1262     , _processed(false)
1263     , _next(NULL)
1264     , _prev(NULL) {
1265     }
1266 
1267     VMRegPair src() const              { return _src; }
1268     int src_id() const                 { return get_id(src()); }
1269     int src_index() const              { return _src_index; }
1270     VMRegPair dst() const              { return _dst; }
1271     void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
1272     int dst_index() const              { return _dst_index; }
1273     int dst_id() const                 { return get_id(dst()); }
1274     MoveOperation* next() const       { return _next; }
1275     MoveOperation* prev() const       { return _prev; }
1276     void set_processed()               { _processed = true; }
1277     bool is_processed() const          { return _processed; }
1278 
1279     // insert
1280     void break_cycle(VMRegPair temp_register) {
1281       // create a new store following the last store
1282       // to move from the temp_register to the original
1283       MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1284 
1285       // break the cycle of links and insert new_store at the end
1286       // break the reverse link.
1287       MoveOperation* p = prev();
1288       assert(p->next() == this, "must be");
1289       _prev = NULL;
1290       p->_next = new_store;
1291       new_store->_prev = p;
1292 
      // change the original store to save its value in the temp.
1294       set_dst(-1, temp_register);
1295     }
1296 
1297     void link(GrowableArray<MoveOperation*>& killer) {
      // link this store in front of the store that it depends on
1299       MoveOperation* n = killer.at_grow(src_id(), NULL);
1300       if (n != NULL) {
1301         assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1302         _next = n;
1303         n->_prev = this;
1304       }
1305     }
1306   };
1307 
1308  private:
1309   GrowableArray<MoveOperation*> edges;
1310 
1311  public:
1312   ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1313                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1314     // Move operations where the dest is the stack can all be
1315     // scheduled first since they can't interfere with the other moves.
1316     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1317       if (in_sig_bt[i] == T_ARRAY) {
1318         c_arg--;
1319         if (out_regs[c_arg].first()->is_stack() &&
1320             out_regs[c_arg + 1].first()->is_stack()) {
1321           arg_order.push(i);
1322           arg_order.push(c_arg);
1323         } else {
1324           if (out_regs[c_arg].first()->is_stack() ||
1325               in_regs[i].first() == out_regs[c_arg].first()) {
1326             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1327           } else {
1328             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1329           }
1330         }
1331       } else if (in_sig_bt[i] == T_VOID) {
1332         arg_order.push(i);
1333         arg_order.push(c_arg);
1334       } else {
1335         if (out_regs[c_arg].first()->is_stack() ||
1336             in_regs[i].first() == out_regs[c_arg].first()) {
1337           arg_order.push(i);
1338           arg_order.push(c_arg);
1339         } else {
1340           add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1341         }
1342       }
1343     }
    // Break any cycles in the register moves and emit them in the
    // proper order.
1346     GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1347     for (int i = 0; i < stores->length(); i++) {
1348       arg_order.push(stores->at(i)->src_index());
1349       arg_order.push(stores->at(i)->dst_index());
1350     }
1351  }
1352 
  // Collect all the move operations
1354   void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1355     if (src.first() == dst.first()) return;
1356     edges.append(new MoveOperation(src_index, src, dst_index, dst));
1357   }
1358 
  // Walk the edges breaking cycles between moves.  The result list
  // can be walked in order to emit the moves in a safe order.
1361   GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1362     // Record which moves kill which values
1363     GrowableArray<MoveOperation*> killer;
1364     for (int i = 0; i < edges.length(); i++) {
1365       MoveOperation* s = edges.at(i);
1366       assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1367       killer.at_put_grow(s->dst_id(), s, NULL);
1368     }
1369     assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1370            "make sure temp isn't in the registers that are killed");
1371 
1372     // create links between loads and stores
1373     for (int i = 0; i < edges.length(); i++) {
1374       edges.at(i)->link(killer);
1375     }
1376 
1377     // at this point, all the move operations are chained together
1378     // in a doubly linked list.  Processing it backwards finds
1379     // the beginning of the chain, forwards finds the end.  If there's
1380     // a cycle it can be broken at any point,  so pick an edge and walk
1381     // backward until the list ends or we end where we started.
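    // Illustration: the two moves rax -> rcx and rcx -> rax link into a
    // two-element cycle; break_cycle() rewrites one of them (say rcx -> rax)
    // into rcx -> temp and appends a final temp -> rax, giving the emitted
    // order rcx -> temp, rax -> rcx, temp -> rax.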
1382     GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1383     for (int e = 0; e < edges.length(); e++) {
1384       MoveOperation* s = edges.at(e);
1385       if (!s->is_processed()) {
1386         MoveOperation* start = s;
1387         // search for the beginning of the chain or cycle
1388         while (start->prev() != NULL && start->prev() != s) {
1389           start = start->prev();
1390         }
1391         if (start->prev() == s) {
1392           start->break_cycle(temp_register);
1393         }
1394         // walk the chain forward inserting to store list
1395         while (start != NULL) {
1396           stores->append(start);
1397           start->set_processed();
1398           start = start->next();
1399         }
1400       }
1401     }
1402     return stores;
1403   }
1404 };
1405 
1406 static void verify_oop_args(MacroAssembler* masm,
1407                             const methodHandle& method,
1408                             const BasicType* sig_bt,
1409                             const VMRegPair* regs) {
1410   Register temp_reg = rbx;  // not part of any compiled calling seq
1411   if (VerifyOops) {
1412     for (int i = 0; i < method->size_of_parameters(); i++) {
1413       if (is_reference_type(sig_bt[i])) {
1414         VMReg r = regs[i].first();
1415         assert(r->is_valid(), "bad oop arg");
1416         if (r->is_stack()) {
1417           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1418           __ verify_oop(temp_reg);
1419         } else {
1420           __ verify_oop(r->as_Register());
1421         }
1422       }
1423     }
1424   }
1425 }
1426 
1427 static void gen_special_dispatch(MacroAssembler* masm,
1428                                  const methodHandle& method,
1429                                  const BasicType* sig_bt,
1430                                  const VMRegPair* regs) {
1431   verify_oop_args(masm, method, sig_bt, regs);
1432   vmIntrinsics::ID iid = method->intrinsic_id();
1433 
1434   // Now write the args into the outgoing interpreter space
1435   bool     has_receiver   = false;
1436   Register receiver_reg   = noreg;
1437   int      member_arg_pos = -1;
1438   Register member_reg     = noreg;
1439   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1440   if (ref_kind != 0) {
1441     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1442     member_reg = rbx;  // known to be free at this point
1443     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1444   } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1445     has_receiver = true;
1446   } else {
1447     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1448   }
1449 
1450   if (member_reg != noreg) {
1451     // Load the member_arg into register, if necessary.
1452     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1453     VMReg r = regs[member_arg_pos].first();
1454     if (r->is_stack()) {
1455       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1456     } else {
1457       // no data motion is needed
1458       member_reg = r->as_Register();
1459     }
1460   }
1461 
1462   if (has_receiver) {
1463     // Make sure the receiver is loaded into a register.
1464     assert(method->size_of_parameters() > 0, "oob");
1465     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1466     VMReg r = regs[0].first();
1467     assert(r->is_valid(), "bad receiver arg");
1468     if (r->is_stack()) {
1469       // Porting note:  This assumes that compiled calling conventions always
1470       // pass the receiver oop in a register.  If this is not true on some
1471       // platform, pick a temp and load the receiver from stack.
1472       fatal("receiver always in a register");
1473       receiver_reg = j_rarg0;  // known to be free at this point
1474       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1475     } else {
1476       // no data motion is needed
1477       receiver_reg = r->as_Register();
1478     }
1479   }
1480 
1481   // Figure out which address we are really jumping to:
1482   MethodHandles::generate_method_handle_dispatch(masm, iid,
1483                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1484 }
1485 
1486 // ---------------------------------------------------------------------------
1487 // Generate a native wrapper for a given method.  The method takes arguments
1488 // in the Java compiled code convention, marshals them to the native
1489 // convention (handlizes oops, etc), transitions to native, makes the call,
1490 // returns to java state (possibly blocking), unhandlizes any result and
1491 // returns.
1492 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1501 //
1502 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1503                                                 const methodHandle& method,
1504                                                 int compile_id,
1505                                                 BasicType* in_sig_bt,
1506                                                 VMRegPair* in_regs,
1507                                                 BasicType ret_type) {
1508   if (method->is_method_handle_intrinsic()) {
1509     vmIntrinsics::ID iid = method->intrinsic_id();
1510     intptr_t start = (intptr_t)__ pc();
1511     int vep_offset = ((intptr_t)__ pc()) - start;
1512     gen_special_dispatch(masm,
1513                          method,
1514                          in_sig_bt,
1515                          in_regs);
1516     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1517     __ flush();
1518     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1519     return nmethod::new_native_nmethod(method,
1520                                        compile_id,
1521                                        masm->code(),
1522                                        vep_offset,
1523                                        frame_complete,
1524                                        stack_slots / VMRegImpl::slots_per_word,
1525                                        in_ByteSize(-1),
1526                                        in_ByteSize(-1),
1527                                        (OopMapSet*)NULL);
1528   }
1529   address native_func = method->native_function();
1530   assert(native_func != NULL, "must have function");
1531 
1532   // An OopMap for lock (and class if static)
1533   OopMapSet *oop_maps = new OopMapSet();
1534   intptr_t start = (intptr_t)__ pc();
1535 
  // We have received a description of where all the java args are located
1537   // on entry to the wrapper. We need to convert these args to where
1538   // the jni function will expect them. To figure out where they go
1539   // we convert the java signature to a C signature by inserting
1540   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1541 
1542   const int total_in_args = method->size_of_parameters();
1543   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1544 
1545   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1546   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1547   BasicType* in_elem_bt = NULL;
1548 
1549   int argc = 0;
1550   out_sig_bt[argc++] = T_ADDRESS;
1551   if (method->is_static()) {
1552     out_sig_bt[argc++] = T_OBJECT;
1553   }
1554 
1555   for (int i = 0; i < total_in_args ; i++ ) {
1556     out_sig_bt[argc++] = in_sig_bt[i];
1557   }
1558 
1559   // Now figure out where the args must be stored and how much stack space
1560   // they require.
1561   int out_arg_slots;
1562   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1563 
1564   // Compute framesize for the wrapper.  We need to handlize all oops in
1565   // incoming registers
1566 
1567   // Calculate the total number of stack slots we will need.
1568 
1569   // First count the abi requirement plus all of the outgoing args
1570   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1571 
1572   // Now the space for the inbound oop handle area
1573   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1574 
1575   int oop_handle_offset = stack_slots;
1576   stack_slots += total_save_slots;
1577 
  // Now any space we need for handlizing a klass if this is a static method
1579 
1580   int klass_slot_offset = 0;
1581   int klass_offset = -1;
1582   int lock_slot_offset = 0;
1583   bool is_static = false;
1584 
1585   if (method->is_static()) {
1586     klass_slot_offset = stack_slots;
1587     stack_slots += VMRegImpl::slots_per_word;
1588     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1589     is_static = true;
1590   }
1591 
1592   // Plus a lock if needed
1593 
1594   if (method->is_synchronized()) {
1595     lock_slot_offset = stack_slots;
1596     stack_slots += VMRegImpl::slots_per_word;
1597   }
1598 
  // Now a place (+2) to save return values or temps during shuffling,
  // plus 4 slots for the return address (which we own) and the saved rbp
1601   stack_slots += 6;
1602 
  // OK, the space we have allocated will look like:
1604   //
1605   //
1606   // FP-> |                     |
1607   //      |---------------------|
1608   //      | 2 slots for moves   |
1609   //      |---------------------|
1610   //      | lock box (if sync)  |
1611   //      |---------------------| <- lock_slot_offset
1612   //      | klass (if static)   |
1613   //      |---------------------| <- klass_slot_offset
1614   //      | oopHandle area      |
1615   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1616   //      | outbound memory     |
1617   //      | based arguments     |
1618   //      |                     |
1619   //      |---------------------|
1620   //      |                     |
1621   // SP-> | out_preserved_slots |
1622   //
1623   //
1624 
1625 
  // Now compute the actual number of stack words we need, rounding to keep
  // the stack properly aligned.
1628   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1629 
1630   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
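  // Worked example (illustration only), assuming out_arg_slots == 0 as on
  // Linux when every C argument fits in a register:
  //   0 (out_preserve) + 0 (out args) + 12 (oop handle area, 6 words)
  //   + 0 (not static) + 0 (not synchronized) + 6 = 18 slots,
  // which aligns up to 20 slots, i.e. stack_size == 80 bytes.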
1631 
  // First thing, make an IC check to see if we should even be here
1633 
1634   // We are free to use all registers as temps without saving them and
1635   // restoring them except rbp. rbp is the only callee save register
1636   // as far as the interpreter and the compiler(s) are concerned.
1637 
1638 
1639   const Register ic_reg = rax;
1640   const Register receiver = j_rarg0;
1641 
1642   Label hit;
1643   Label exception_pending;
1644 
1645   assert_different_registers(ic_reg, receiver, rscratch1);
1646   __ verify_oop(receiver);
1647   __ load_klass(rscratch1, receiver, rscratch2);
1648   __ cmpq(ic_reg, rscratch1);
1649   __ jcc(Assembler::equal, hit);
1650 
1651   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1652 
1653   // Verified entry point must be aligned
1654   __ align(8);
1655 
1656   __ bind(hit);
1657 
1658   int vep_offset = ((intptr_t)__ pc()) - start;
1659 
1660   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1661     Label L_skip_barrier;
1662     Register klass = r10;
1663     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1664     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1665 
1666     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1667 
1668     __ bind(L_skip_barrier);
1669   }
1670 
1671 #ifdef COMPILER1
1672   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1673   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1674     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1675   }
1676 #endif // COMPILER1
1677 
1678   // The instruction at the verified entry point must be 5 bytes or longer
1679   // because it can be patched on the fly by make_non_entrant. The stack bang
1680   // instruction fits that requirement.
1681 
1682   // Generate stack overflow check
1683   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1684 
1685   // Generate a new frame for the wrapper.
1686   __ enter();
1687   // -2 because return address is already present and so is saved rbp
1688   __ subptr(rsp, stack_size - 2*wordSize);
1689 
1690   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1691   bs->nmethod_entry_barrier(masm);
1692 
1693   // Frame is now completed as far as size and linkage.
1694   int frame_complete = ((intptr_t)__ pc()) - start;
1695 
1696     if (UseRTMLocking) {
1697       // Abort RTM transaction before calling JNI
1698       // because critical section will be large and will be
1699       // aborted anyway. Also nmethod could be deoptimized.
1700       __ xabort(0);
1701     }
1702 
1703 #ifdef ASSERT
1704     {
1705       Label L;
1706       __ mov(rax, rsp);
1707       __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1708       __ cmpptr(rax, rsp);
1709       __ jcc(Assembler::equal, L);
1710       __ stop("improperly aligned stack");
1711       __ bind(L);
1712     }
1713 #endif /* ASSERT */
1714 
1715 
1716   // We use r14 as the oop handle for the receiver/klass
1717   // It is callee save so it survives the call to native
1718 
1719   const Register oop_handle_reg = r14;
1720 
1721   //
  // We immediately shuffle the arguments so that for any vm call we have to
  // make from here on out (sync slow path, jvmti, etc.) we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.
1726 
1727   // -----------------
1728   // The Grand Shuffle
1729 
  // The Java calling convention is either equal (linux) or denser (win64) than the
  // C calling convention. However, because of the jni_env argument, the C calling
  // convention always has at least one more argument (and two for static methods) than Java.
1733   // Therefore if we move the args from java -> c backwards then we will never have
1734   // a register->register conflict and we don't have to build a dependency graph
1735   // and figure out how to break any cycles.
1736   //
1737 
1738   // Record esp-based slot for receiver on stack for non-static methods
1739   int receiver_offset = -1;
1740 
1741   // This is a trick. We double the stack slots so we can claim
1742   // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
1744   // sure we can capture all the incoming oop args from the
1745   // caller.
1746   //
1747   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1748 
1749   // Mark location of rbp (someday)
1750   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1751 
1752   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1753   // All inbound args are referenced based on rbp and all outbound args via rsp.
1754 
1755 
1756 #ifdef ASSERT
1757   bool reg_destroyed[RegisterImpl::number_of_registers];
1758   bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1759   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1760     reg_destroyed[r] = false;
1761   }
1762   for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1763     freg_destroyed[f] = false;
1764   }
1765 
1766 #endif /* ASSERT */
1767 
1768   // For JNI natives the incoming and outgoing registers are offset upwards.
1769   GrowableArray<int> arg_order(2 * total_in_args);
1770 
1771   VMRegPair tmp_vmreg;
1772   tmp_vmreg.set2(rbx->as_VMReg());
1773 
1774   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1775     arg_order.push(i);
1776     arg_order.push(c_arg);
1777   }
1778 
1779   int temploc = -1;
1780   for (int ai = 0; ai < arg_order.length(); ai += 2) {
1781     int i = arg_order.at(ai);
1782     int c_arg = arg_order.at(ai + 1);
1783     __ block_comment(err_msg("move %d -> %d", i, c_arg));
1784 #ifdef ASSERT
1785     if (in_regs[i].first()->is_Register()) {
1786       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1787     } else if (in_regs[i].first()->is_XMMRegister()) {
1788       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1789     }
1790     if (out_regs[c_arg].first()->is_Register()) {
1791       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1792     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1793       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1794     }
1795 #endif /* ASSERT */
1796     switch (in_sig_bt[i]) {
1797       case T_ARRAY:
1798       case T_OBJECT:
1799         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1800                     ((i == 0) && (!is_static)),
1801                     &receiver_offset);
1802         break;
1803       case T_VOID:
1804         break;
1805 
      case T_FLOAT:
        __ float_move(in_regs[i], out_regs[c_arg]);
        break;
1809 
1810       case T_DOUBLE:
1811         assert( i + 1 < total_in_args &&
1812                 in_sig_bt[i + 1] == T_VOID &&
1813                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1814         __ double_move(in_regs[i], out_regs[c_arg]);
1815         break;
1816 
1817       case T_LONG :
1818         __ long_move(in_regs[i], out_regs[c_arg]);
1819         break;
1820 
1821       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
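        // unreachable: Java signatures never contain T_ADDRESS (in product
        // builds control would simply fall into the default case)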
1822 
1823       default:
1824         __ move32_64(in_regs[i], out_regs[c_arg]);
1825     }
1826   }
1827 
1828   int c_arg;
1829 
1830   // Pre-load a static method's oop into r14.  Used both by locking code and
1831   // the normal JNI call code.
  // Point c_arg at the first arg that is already loaded in case we
1833   // need to spill before we call out
1834   c_arg = total_c_args - total_in_args;
1835 
1836   if (method->is_static()) {
1837 
1838     //  load oop into a register
1839     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1840 
    // Now handlize the static class mirror; it's known not-null.
1842     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1843     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1844 
1845     // Now get the handle
1846     __ lea(oop_handle_reg, Address(rsp, klass_offset));
1847     // store the klass handle as second argument
1848     __ movptr(c_rarg1, oop_handle_reg);
1849     // and protect the arg if we must spill
1850     c_arg--;
1851   }
1852 
1853   // Change state to native (we save the return address in the thread, since it might not
  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
1855   // points into the right code segment. It does not have to be the correct return pc.
1856   // We use the same pc/oopMap repeatedly when we call out
1857 
1858   intptr_t the_pc = (intptr_t) __ pc();
1859   oop_maps->add_gc_map(the_pc - start, map);
1860 
1861   __ set_last_Java_frame(rsp, noreg, (address)the_pc);
1862 
1863 
  // We have all of the arguments set up at this point. We must not touch any of the
  // argument registers from here on (if we had to save and restore them there would
  // be no oop map describing those saved values).
1866 
1867   {
1868     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
1869     // protect the args we've loaded
1870     save_args(masm, total_c_args, c_arg, out_regs);
1871     __ mov_metadata(c_rarg1, method());
1872     __ call_VM_leaf(
1873       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
1874       r15_thread, c_rarg1);
1875     restore_args(masm, total_c_args, c_arg, out_regs);
1876   }
1877 
1878   // RedefineClasses() tracing support for obsolete method entry
1879   if (log_is_enabled(Trace, redefine, class, obsolete)) {
1880     // protect the args we've loaded
1881     save_args(masm, total_c_args, c_arg, out_regs);
1882     __ mov_metadata(c_rarg1, method());
1883     __ call_VM_leaf(
1884       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
1885       r15_thread, c_rarg1);
1886     restore_args(masm, total_c_args, c_arg, out_regs);
1887   }
1888 
1889   // Lock a synchronized method
1890 
1891   // Register definitions used by locking and unlocking
1892 
1893   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
1894   const Register obj_reg  = rbx;  // Will contain the oop
1895   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
1896   const Register old_hdr  = r13;  // value of old header at unlock time
1897 
1898   Label slow_path_lock;
1899   Label lock_done;
1900 
1901   if (method->is_synchronized()) {
1902 
1903     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
1904 
1905     // Get the handle (the 2nd argument)
1906     __ mov(oop_handle_reg, c_rarg1);
1907 
1908     // Get address of the box
1909 
1910     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
1911 
1912     // Load the oop from the handle
1913     __ movptr(obj_reg, Address(oop_handle_reg, 0));
1914 
1915     if (!UseHeavyMonitors) {
1916       // Load immediate 1 into swap_reg %rax
1917       __ movl(swap_reg, 1);
1918 
1919       // Load (object->mark() | 1) into swap_reg %rax
1920       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1921 
1922       // Save (object->mark() | 1) into BasicLock's displaced header
1923       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
1924 
1925       // src -> dest iff dest == rax else rax <- dest
1926       __ lock();
1927       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1928       __ jcc(Assembler::equal, lock_done);
1929 
1930       // Hmm should this move to the slow path code area???
1931 
1932       // Test if the oopMark is an obvious stack pointer, i.e.,
1933       //  1) (mark & 3) == 0, and
      //  2) rsp <= mark < rsp + os::pagesize()
1935       // These 3 tests can be done by evaluating the following
1936       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
1937       // assuming both stack pointer and pagesize have their
1938       // least significant 2 bits clear.
1939       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
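      //
      // E.g. (illustration only) with a 4K page, 3 - os::vm_page_size() is
      // -4093 == 0x...fffff003, so the result is zero exactly when
      // (mark - rsp) has its low two bits clear and is less than 4096,
      // i.e. the mark points just above rsp into our own stack page
      // (a recursive stack lock).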
1940 
1941       __ subptr(swap_reg, rsp);
1942       __ andptr(swap_reg, 3 - os::vm_page_size());
1943 
      // Save the test result; for the recursive case the result is zero
1945       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
1946       __ jcc(Assembler::notEqual, slow_path_lock);
1947     } else {
1948       __ jmp(slow_path_lock);
1949     }
1950 
1951     // Slow path will re-enter here
1952 
1953     __ bind(lock_done);
1954   }
1955 
1956   // Finally just about ready to make the JNI call
1957 
1958   // get JNIEnv* which is first argument to native
1959   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
1960 
1961   // Now set thread in native
1962   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
1963 
1964   __ call(RuntimeAddress(native_func));
1965 
1966   // Verify or restore cpu control state after JNI call
1967   __ restore_cpu_control_state_after_jni();
1968 
1969   // Unpack native results.
1970   switch (ret_type) {
1971   case T_BOOLEAN: __ c2bool(rax);            break;
1972   case T_CHAR   : __ movzwl(rax, rax);      break;
1973   case T_BYTE   : __ sign_extend_byte (rax); break;
1974   case T_SHORT  : __ sign_extend_short(rax); break;
1975   case T_INT    : /* nothing to do */        break;
1976   case T_DOUBLE :
1977   case T_FLOAT  :
1978     // Result is in xmm0 we'll save as needed
1979     break;
1980   case T_ARRAY:                 // Really a handle
1981   case T_OBJECT:                // Really a handle
1982       break; // can't de-handlize until after safepoint check
1983   case T_VOID: break;
1984   case T_LONG: break;
1985   default       : ShouldNotReachHere();
1986   }
1987 
1988   Label after_transition;
1989 
1990   // Switch thread to "native transition" state before reading the synchronization state.
1991   // This additional state is necessary because reading and testing the synchronization
1992   // state is not atomic w.r.t. GC, as this scenario demonstrates:
1993   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
1994   //     VM thread changes sync state to synchronizing and suspends threads for GC.
1995   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
1997   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
1998 
1999   // Force this write out before the read below
2000   __ membar(Assembler::Membar_mask_bits(
2001               Assembler::LoadLoad | Assembler::LoadStore |
2002               Assembler::StoreLoad | Assembler::StoreStore));
2003 
2004   // check for safepoint operation in progress and/or pending suspend requests
2005   {
2006     Label Continue;
2007     Label slow_path;
2008 
2009     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2010 
2011     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2012     __ jcc(Assembler::equal, Continue);
2013     __ bind(slow_path);
2014 
    // Don't use call_VM as it will see a possible pending exception and forward it
    // and never return here, preventing us from clearing _last_native_pc down below.
    // We also can't use call_VM_leaf, as it will check to see if rsi & rdi are
    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
    // by hand.
2020     //
2021     __ vzeroupper();
2022     save_native_result(masm, ret_type, stack_slots);
2023     __ mov(c_rarg0, r15_thread);
2024     __ mov(r12, rsp); // remember sp
2025     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2026     __ andptr(rsp, -16); // align stack as required by ABI
2027     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2028     __ mov(rsp, r12); // restore sp
2029     __ reinit_heapbase();
2030     // Restore any method result value
2031     restore_native_result(masm, ret_type, stack_slots);
2032     __ bind(Continue);
2033   }
2034 
2035   // change thread state
2036   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2037   __ bind(after_transition);
2038 
2039   Label reguard;
2040   Label reguard_done;
2041   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2042   __ jcc(Assembler::equal, reguard);
2043   __ bind(reguard_done);
2044 
  // The native result, if any, is live here
2046 
2047   // Unlock
2048   Label unlock_done;
2049   Label slow_path_unlock;
2050   if (method->is_synchronized()) {
2051 
2052     // Get locked oop from the handle we passed to jni
2053     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2054 
2055     Label done;
2056 
2057     if (!UseHeavyMonitors) {
2058       // Simple recursive lock?
2059       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2060       __ jcc(Assembler::equal, done);
2061     }
2062 
2063     // Must save rax if it is live now because cmpxchg must use it
2064     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2065       save_native_result(masm, ret_type, stack_slots);
2066     }
2067 
2068 
2069     if (!UseHeavyMonitors) {
2070       // get address of the stack lock
2071       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2072       //  get old displaced header
2073       __ movptr(old_hdr, Address(rax, 0));
2074 
2075       // Atomic swap old header if oop still contains the stack lock
2076       __ lock();
2077       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2078       __ jcc(Assembler::notEqual, slow_path_unlock);
2079     } else {
2080       __ jmp(slow_path_unlock);
2081     }
2082 
2083     // slow path re-enters here
2084     __ bind(unlock_done);
2085     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2086       restore_native_result(masm, ret_type, stack_slots);
2087     }
2088 
2089     __ bind(done);
2090 
2091   }
2092   {
2093     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2094     save_native_result(masm, ret_type, stack_slots);
2095     __ mov_metadata(c_rarg1, method());
2096     __ call_VM_leaf(
2097          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2098          r15_thread, c_rarg1);
2099     restore_native_result(masm, ret_type, stack_slots);
2100   }
2101 
2102   __ reset_last_Java_frame(false);
2103 
2104   // Unbox oop result, e.g. JNIHandles::resolve value.
2105   if (is_reference_type(ret_type)) {
2106     __ resolve_jobject(rax /* value */,
2107                        r15_thread /* thread */,
2108                        rcx /* tmp */);
2109   }
2110 
2111   if (CheckJNICalls) {
2112     // clear_pending_jni_exception_check
2113     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2114   }
2115 
2116   // reset handle block
2117   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2118   __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2119 
2120   // pop our frame
2121 
2122   __ leave();
2123 
2124   // Any exception pending?
2125   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2126   __ jcc(Assembler::notEqual, exception_pending);
2127 
2128   // Return
2129 
2130   __ ret(0);
2131 
2132   // Unexpected paths are out of line and go here
2133 
2134   // forward the exception
2135   __ bind(exception_pending);
2136 
2137   // and forward the exception
2138   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2139 
2140   // Slow path locking & unlocking
2141   if (method->is_synchronized()) {
2142 
2143     // BEGIN Slow path lock
2144     __ bind(slow_path_lock);
2145 
    // last_Java_frame is set up; no exceptions, so do a vanilla call, not call_VM
2147     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2148 
2149     // protect the args we've loaded
2150     save_args(masm, total_c_args, c_arg, out_regs);
2151 
2152     __ mov(c_rarg0, obj_reg);
2153     __ mov(c_rarg1, lock_reg);
2154     __ mov(c_rarg2, r15_thread);
2155 
2156     // Not a leaf but we have last_Java_frame setup as we want
2157     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2158     restore_args(masm, total_c_args, c_arg, out_regs);
2159 
2160 #ifdef ASSERT
2161     { Label L;
2162     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2163     __ jcc(Assembler::equal, L);
2164     __ stop("no pending exception allowed on exit from monitorenter");
2165     __ bind(L);
2166     }
2167 #endif
2168     __ jmp(lock_done);
2169 
2170     // END Slow path lock
2171 
2172     // BEGIN Slow path unlock
2173     __ bind(slow_path_unlock);
2174 
2175     // If we haven't already saved the native result we must save it now as xmm registers
2176     // are still exposed.
2177     __ vzeroupper();
2178     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2179       save_native_result(masm, ret_type, stack_slots);
2180     }
2181 
2182     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2183 
2184     __ mov(c_rarg0, obj_reg);
2185     __ mov(c_rarg2, r15_thread);
2186     __ mov(r12, rsp); // remember sp
2187     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2188     __ andptr(rsp, -16); // align stack as required by ABI
2189 
2190     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2191     // NOTE that obj_reg == rbx currently
2192     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2193     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2194 
2195     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2196     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2197     __ mov(rsp, r12); // restore sp
2198     __ reinit_heapbase();
2199 #ifdef ASSERT
2200     {
2201       Label L;
2202       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2203       __ jcc(Assembler::equal, L);
2204       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2205       __ bind(L);
2206     }
2207 #endif /* ASSERT */
2208 
2209     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2210 
2211     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2212       restore_native_result(masm, ret_type, stack_slots);
2213     }
2214     __ jmp(unlock_done);
2215 
2216     // END Slow path unlock
2217 
2218   } // synchronized
2219 
2220   // SLOW PATH Reguard the stack if needed
2221 
2222   __ bind(reguard);
2223   __ vzeroupper();
2224   save_native_result(masm, ret_type, stack_slots);
2225   __ mov(r12, rsp); // remember sp
2226   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2227   __ andptr(rsp, -16); // align stack as required by ABI
2228   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2229   __ mov(rsp, r12); // restore sp
2230   __ reinit_heapbase();
2231   restore_native_result(masm, ret_type, stack_slots);
2232   // and continue
2233   __ jmp(reguard_done);
2234 
2235 
2236 
2237   __ flush();
2238 
2239   nmethod *nm = nmethod::new_native_nmethod(method,
2240                                             compile_id,
2241                                             masm->code(),
2242                                             vep_offset,
2243                                             frame_complete,
2244                                             stack_slots / VMRegImpl::slots_per_word,
2245                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2246                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2247                                             oop_maps);
2248 
2249   return nm;
2250 }
2251 
// This function returns the adjustment (in number of words) to apply to a c2i adapter
// activation for use during deoptimization
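// For example (illustration only): with 3 callee parameters and 7 callee
// locals the adjustment is (7 - 3) * Interpreter::stackElementWords words.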
2254 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2255   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2256 }
2257 
2258 
2259 uint SharedRuntime::out_preserve_stack_slots() {
2260   return 0;
2261 }
2262 
2263 
2264 // Number of stack slots between incoming argument block and the start of
2265 // a new frame.  The PROLOG must add this many slots to the stack.  The
// EPILOG must remove this many slots.  amd64 needs two slots for the
// return address and two for the saved rbp.
2268 uint SharedRuntime::in_preserve_stack_slots() {
2269   return 4 + 2 * VerifyStackAtCalls;
2270 }
2271 
2272 //------------------------------generate_deopt_blob----------------------------
2273 void SharedRuntime::generate_deopt_blob() {
2274   // Allocate space for the code
2275   ResourceMark rm;
2276   // Setup code generation tools
2277   int pad = 0;
2278   if (UseAVX > 2) {
2279     pad += 1024;
2280   }
2281 #if INCLUDE_JVMCI
2282   if (EnableJVMCI) {
2283     pad += 512; // Increase the buffer size when compiling for JVMCI
2284   }
2285 #endif
2286   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2287   MacroAssembler* masm = new MacroAssembler(&buffer);
2288   int frame_size_in_words;
2289   OopMap* map = NULL;
2290   OopMapSet *oop_maps = new OopMapSet();
2291 
2292   // -------------
2293   // This code enters when returning to a de-optimized nmethod.  A return
  // address has been pushed on the stack, and return values are in
2295   // registers.
2296   // If we are doing a normal deopt then we were called from the patched
2297   // nmethod from the point we returned to the nmethod. So the return
2298   // address on the stack is wrong by NativeCall::instruction_size
2299   // We will adjust the value so it looks like we have the original return
2300   // address on the stack (like when we eagerly deoptimized).
2301   // In the case of an exception pending when deoptimizing, we enter
2302   // with a return address on the stack that points after the call we patched
2303   // into the exception handler. We have the following register state from,
2304   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2305   //    rax: exception oop
2306   //    rbx: exception handler
2307   //    rdx: throwing pc
2308   // So in this case we simply jam rdx into the useless return address and
2309   // the stack looks just like we want.
2310   //
2311   // At this point we need to de-opt.  We save the argument return
2312   // registers.  We call the first C routine, fetch_unroll_info().  This
2313   // routine captures the return values and returns a structure which
2314   // describes the current frame size and the sizes of all replacement frames.
2315   // The current frame is compiled code and may contain many inlined
2316   // functions, each with their own JVM state.  We pop the current frame, then
2317   // push all the new frames.  Then we call the C routine unpack_frames() to
2318   // populate these frames.  Finally unpack_frames() returns us the new target
2319   // address.  Notice that callee-save registers are BLOWN here; they have
2320   // already been captured in the vframeArray at the time the return PC was
2321   // patched.
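  //
  // For instance, deoptimizing a compiled frame that holds two JVM states
  // (an inlined callee and its caller) replaces that single frame with two
  // skeletal interpreter frames, which unpack_frames() then fills in.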
2322   address start = __ pc();
2323   Label cont;
2324 
2325   // Prolog for non exception case!
2326 
2327   // Save everything in sight.
2328   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2329 
2330   // Normal deoptimization.  Save exec mode for unpack_frames.
2331   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2332   __ jmp(cont);
2333 
2334   int reexecute_offset = __ pc() - start;
2335 #if INCLUDE_JVMCI && !defined(COMPILER1)
2336   if (EnableJVMCI && UseJVMCICompiler) {
2337     // JVMCI does not use this kind of deoptimization
2338     __ should_not_reach_here();
2339   }
2340 #endif
2341 
2342   // Reexecute case
  // the return address is the pc that describes which bci to re-execute at
2344 
2345   // No need to update map as each call to save_live_registers will produce identical oopmap
2346   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2347 
2348   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2349   __ jmp(cont);
2350 
2351 #if INCLUDE_JVMCI
2352   Label after_fetch_unroll_info_call;
2353   int implicit_exception_uncommon_trap_offset = 0;
2354   int uncommon_trap_offset = 0;
2355 
2356   if (EnableJVMCI) {
2357     implicit_exception_uncommon_trap_offset = __ pc() - start;
2358 
2359     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2360     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2361 
2362     uncommon_trap_offset = __ pc() - start;
2363 
2364     // Save everything in sight.
2365     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2366     // fetch_unroll_info needs to call last_java_frame()
2367     __ set_last_Java_frame(noreg, noreg, NULL);
2368 
2369     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2370     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2371 
2372     __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2373     __ mov(c_rarg0, r15_thread);
2374     __ movl(c_rarg2, r14); // exec mode
2375     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2376     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2377 
2378     __ reset_last_Java_frame(false);
2379 
2380     __ jmp(after_fetch_unroll_info_call);
2381   } // EnableJVMCI
2382 #endif // INCLUDE_JVMCI
2383 
2384   int exception_offset = __ pc() - start;
2385 
2386   // Prolog for exception case
2387 
  // All registers are dead at this entry point, except for rax and
  // rdx, which contain the exception oop and exception pc
2390   // respectively.  Set them in TLS and fall thru to the
2391   // unpack_with_exception_in_tls entry point.
2392 
2393   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2394   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2395 
2396   int exception_in_tls_offset = __ pc() - start;
2397 
2398   // new implementation because exception oop is now passed in JavaThread
2399 
2400   // Prolog for exception case
2401   // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2403   // tos: stack at point of call to method that threw the exception (i.e. only
2404   // args are on the stack, no return address)
2405 
2406   // make room on stack for the return address
2407   // It will be patched later with the throwing pc. The correct value is not
2408   // available now because loading it from memory would destroy registers.
2409   __ push(0);
2410 
2411   // Save everything in sight.
2412   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2413 
2414   // Now it is safe to overwrite any register
2415 
2416   // Deopt during an exception.  Save exec mode for unpack_frames.
2417   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2418 
2419   // load throwing pc from JavaThread and patch it as the return address
2420   // of the current frame. Then clear the field in JavaThread
2421 
2422   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2423   __ movptr(Address(rbp, wordSize), rdx);
2424   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2425 
2426 #ifdef ASSERT
2427   // verify that there is really an exception oop in JavaThread
2428   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2429   __ verify_oop(rax);
2430 
2431   // verify that there is no pending exception
2432   Label no_pending_exception;
2433   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2434   __ testptr(rax, rax);
2435   __ jcc(Assembler::zero, no_pending_exception);
2436   __ stop("must not have pending exception here");
2437   __ bind(no_pending_exception);
2438 #endif
2439 
2440   __ bind(cont);
2441 
2442   // Call C code.  Need thread and this frame, but NOT official VM entry
2443   // crud.  We cannot block on this call, no GC can happen.
2444   //
2445   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2446 
2447   // fetch_unroll_info needs to call last_java_frame().
2448 
2449   __ set_last_Java_frame(noreg, noreg, NULL);
2450 #ifdef ASSERT
2451   { Label L;
2452     __ cmpptr(Address(r15_thread,
2453                     JavaThread::last_Java_fp_offset()),
2454             (int32_t)0);
2455     __ jcc(Assembler::equal, L);
2456     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2457     __ bind(L);
2458   }
2459 #endif // ASSERT
2460   __ mov(c_rarg0, r15_thread);
2461   __ movl(c_rarg1, r14); // exec_mode
2462   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2463 
2464   // Need to have an oopmap that tells fetch_unroll_info where to
2465   // find any register it might need.
2466   oop_maps->add_gc_map(__ pc() - start, map);
2467 
2468   __ reset_last_Java_frame(false);
2469 
2470 #if INCLUDE_JVMCI
2471   if (EnableJVMCI) {
2472     __ bind(after_fetch_unroll_info_call);
2473   }
2474 #endif
2475 
2476   // Load UnrollBlock* into rdi
2477   __ mov(rdi, rax);
2478 
2479   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
  Label noException;
2481   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2482   __ jcc(Assembler::notEqual, noException);
2483   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // QQQ this is useless; it was NULL above
2485   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2486   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2487   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2488 
2489   __ verify_oop(rax);
2490 
2491   // Overwrite the result registers with the exception results.
2492   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2493   // I think this is useless
2494   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2495 
2496   __ bind(noException);
2497 
2498   // Only register save data is on the stack.
2499   // Now restore the result registers.  Everything else is either dead
2500   // or captured in the vframeArray.
2501   RegisterSaver::restore_result_registers(masm);
2502 
  // All of the register save area has been popped off the stack. Only the
2504   // return address remains.
2505 
2506   // Pop all the frames we must move/replace.
2507   //
2508   // Frame picture (youngest to oldest)
2509   // 1: self-frame (no frame link)
2510   // 2: deopting frame  (no frame link)
2511   // 3: caller of deopting frame (could be compiled/interpreted).
2512   //
2513   // Note: by leaving the return address of self-frame on the stack
2514   // and using the size of frame 2 to adjust the stack
2515   // when we are done the return to frame 3 will still be on the stack.
2516 
2517   // Pop deoptimized frame
2518   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2519   __ addptr(rsp, rcx);
2520 
2521   // rsp should be pointing at the return address to the caller (3)
2522 
2523   // Pick up the initial fp we should save
2524   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2525   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2526 
2527 #ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
2531   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2532   __ bang_stack_size(rbx, rcx);
2533 #endif
2534 
2535   // Load address of array of frame pcs into rcx
2536   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2537 
2538   // Trash the old pc
2539   __ addptr(rsp, wordSize);
2540 
2541   // Load address of array of frame sizes into rsi
2542   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2543 
2544   // Load counter into rdx
2545   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2546 
2547   // Now adjust the caller's stack to make up for the extra locals
2548   // but record the original sp so that we can save it in the skeletal interpreter
2549   // frame and the stack walking of interpreter_sender will get the unextended sp
2550   // value and not the "real" sp value.
2551 
2552   const Register sender_sp = r8;
2553 
2554   __ mov(sender_sp, rsp);
2555   __ movl(rbx, Address(rdi,
2556                        Deoptimization::UnrollBlock::
2557                        caller_adjustment_offset_in_bytes()));
2558   __ subptr(rsp, rbx);
2559 
2560   // Push interpreter frames in a loop
2561   Label loop;
2562   __ bind(loop);
2563   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2564   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2565   __ pushptr(Address(rcx, 0));          // Save return address
2566   __ enter();                           // Save old & set new ebp
2567   __ subptr(rsp, rbx);                  // Prolog
2568   // This value is corrected by layout_activation_impl
2569   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2570   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2571   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2572   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2573   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2574   __ decrementl(rdx);                   // Decrement counter
2575   __ jcc(Assembler::notZero, loop);
2576   __ pushptr(Address(rcx, 0));          // Save final return address
2577 
2578   // Re-push self-frame
2579   __ enter();                           // Save old & set new ebp
2580 
2581   // Allocate a full sized register save area.
  // Return address and rbp are in place, so we allocate two fewer words.
2583   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2584 
2585   // Restore frame locals after moving the frame
2586   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2587   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2588 
2589   // Call C code.  Need thread but NOT official VM entry
2590   // crud.  We cannot block on this call, no GC can happen.  Call should
2591   // restore return values to their stack-slots with the new SP.
2592   //
2593   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2594 
2595   // Use rbp because the frames look interpreted now
2596   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2597   // Don't need the precise return PC here, just precise enough to point into this code blob.
2598   address the_pc = __ pc();
2599   __ set_last_Java_frame(noreg, rbp, the_pc);
2600 
2601   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2602   __ mov(c_rarg0, r15_thread);
2603   __ movl(c_rarg1, r14); // second arg: exec_mode
2604   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2605   // Revert SP alignment after call since we're going to do some SP relative addressing below
2606   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2607 
2608   // Set an oopmap for the call site
2609   // Use the same PC we used for the last java frame
2610   oop_maps->add_gc_map(the_pc - start,
2611                        new OopMap( frame_size_in_words, 0 ));
2612 
2613   // Clear fp AND pc
2614   __ reset_last_Java_frame(true);
2615 
2616   // Collect return values
2617   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2618   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2619   // I think this is useless (throwing pc?)
2620   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2621 
2622   // Pop self-frame.
2623   __ leave();                           // Epilog
2624 
2625   // Jump to interpreter
2626   __ ret(0);
2627 
2628   // Make sure all code is generated
2629   masm->flush();
2630 
2631   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2632   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2633 #if INCLUDE_JVMCI
2634   if (EnableJVMCI) {
2635     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2636     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2637   }
2638 #endif
2639 }
2640 
2641 #ifdef COMPILER2
2642 //------------------------------generate_uncommon_trap_blob--------------------
2643 void SharedRuntime::generate_uncommon_trap_blob() {
2644   // Allocate space for the code
2645   ResourceMark rm;
2646   // Setup code generation tools
2647   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2648   MacroAssembler* masm = new MacroAssembler(&buffer);
2649 
2650   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2651 
2652   address start = __ pc();
2653 
2654   if (UseRTMLocking) {
2655     // Abort RTM transaction before possible nmethod deoptimization.
2656     __ xabort(0);
2657   }
2658 
2659   // Push self-frame.  We get here with a return address on the
2660   // stack, so rsp is 8-byte aligned until we allocate our frame.
2661   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2662 
2663   // No callee saved registers. rbp is assumed implicitly saved
2664   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2665 
2666   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2667   // runtime expects it.
2668   __ movl(c_rarg1, j_rarg0);
2669 
2670   __ set_last_Java_frame(noreg, noreg, NULL);
2671 
2672   // Call C code.  Need thread but NOT official VM entry
2673   // crud.  We cannot block on this call, no GC can happen.  Call should
2674   // capture callee-saved registers as well as return values.
2675   // Thread is in rdi already.
2676   //
2677   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2678 
2679   __ mov(c_rarg0, r15_thread);
2680   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2681   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2682 
2683   // Set an oopmap for the call site
2684   OopMapSet* oop_maps = new OopMapSet();
2685   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2686 
2687   // location of rbp is known implicitly by the frame sender code
2688 
2689   oop_maps->add_gc_map(__ pc() - start, map);
2690 
2691   __ reset_last_Java_frame(false);
2692 
2693   // Load UnrollBlock* into rdi
2694   __ mov(rdi, rax);
2695 
2696 #ifdef ASSERT
2697   { Label L;
2698     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2699             (int32_t)Deoptimization::Unpack_uncommon_trap);
2700     __ jcc(Assembler::equal, L);
2701     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2702     __ bind(L);
2703   }
2704 #endif
2705 
2706   // Pop all the frames we must move/replace.
2707   //
2708   // Frame picture (youngest to oldest)
2709   // 1: self-frame (no frame link)
2710   // 2: deopting frame  (no frame link)
2711   // 3: caller of deopting frame (could be compiled/interpreted).
2712 
2713   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2714   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2715 
2716   // Pop deoptimized frame (int)
2717   __ movl(rcx, Address(rdi,
2718                        Deoptimization::UnrollBlock::
2719                        size_of_deoptimized_frame_offset_in_bytes()));
2720   __ addptr(rsp, rcx);
2721 
2722   // rsp should be pointing at the return address to the caller (3)
2723 
2724   // Pick up the initial fp we should save
2725   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2726   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2727 
2728 #ifdef ASSERT
2729   // Compilers generate code that bangs the stack by as much as the
2730   // interpreter would need. So this stack banging should never
2731   // trigger a fault. Verify that it does not on non-product builds.
2732   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2733   __ bang_stack_size(rbx, rcx);
2734 #endif
2735 
2736   // Load address of array of frame pcs into rcx (address*)
2737   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2738 
2739   // Trash the return pc
2740   __ addptr(rsp, wordSize);
2741 
2742   // Load address of array of frame sizes into rsi (intptr_t*)
2743   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2744 
2745   // Counter
2746   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
2747 
2748   // Now adjust the caller's stack to make up for the extra locals, but
2749   // record the original sp first so that we can save it in the skeletal
2750   // interpreter frame; the stack walking of interpreter_sender will then
2751   // get the unextended sp value and not the "real" sp value.
2752 
2753   const Register sender_sp = r8;
2754 
2755   __ mov(sender_sp, rsp);
2756   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
2757   __ subptr(rsp, rbx);
2758 
2759   // Push interpreter frames in a loop
2760   Label loop;
2761   __ bind(loop);
2762   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2763   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2764   __ pushptr(Address(rcx, 0));     // Save return address
2765   __ enter();                      // Save old & set new rbp
2766   __ subptr(rsp, rbx);             // Prolog
2767   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2768             sender_sp);            // Make it walkable
2769   // This value is corrected by layout_activation_impl
2770   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2771   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
2772   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
2773   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
2774   __ decrementl(rdx);              // Decrement counter
2775   __ jcc(Assembler::notZero, loop);
2776   __ pushptr(Address(rcx, 0));     // Save final return address
2777 
2778   // Re-push self-frame
2779   __ enter();                 // Save old & set new rbp
2780   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2781                               // Prolog
2782 
2783   // Use rbp because the frames look interpreted now
2784   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2785   // Don't need the precise return PC here, just precise enough to point into this code blob.
2786   address the_pc = __ pc();
2787   __ set_last_Java_frame(noreg, rbp, the_pc);
2788 
2789   // Call C code.  Need thread but NOT official VM entry
2790   // crud.  We cannot block on this call, no GC can happen.  Call should
2791   // restore return values to their stack-slots with the new SP.
2792   // Thread is in rdi already.
2793   //
2794   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2795 
2796   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2797   __ mov(c_rarg0, r15_thread);
2798   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2799   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2800 
2801   // Set an oopmap for the call site
2802   // Use the same PC we used for the last java frame
2803   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2804 
2805   // Clear fp AND pc
2806   __ reset_last_Java_frame(true);
2807 
2808   // Pop self-frame.
2809   __ leave();                 // Epilog
2810 
2811   // Jump to interpreter
2812   __ ret(0);
2813 
2814   // Make sure all code is generated
2815   masm->flush();
2816 
2817   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
2818                                                  SimpleRuntimeFrame::framesize >> 1);
2819 }
2820 #endif // COMPILER2
2821 
2822 //------------------------------generate_handler_blob------
2823 //
2824 // Generate a special Compile2Runtime blob that saves all registers,
2825 // and sets up an oopmap.
2826 //
2827 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
2828   assert(StubRoutines::forward_exception_entry() != NULL,
2829          "must be generated before");
2830 
2831   ResourceMark rm;
2832   OopMapSet *oop_maps = new OopMapSet();
2833   OopMap* map;
2834 
2835   // Allocate space for the code.  Setup code generation tools.
2836   CodeBuffer buffer("handler_blob", 2048, 1024);
2837   MacroAssembler* masm = new MacroAssembler(&buffer);
2838 
2839   address start   = __ pc();
2840   address call_pc = NULL;
2841   int frame_size_in_words;
2842   bool cause_return = (poll_type == POLL_AT_RETURN);
2843   bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
2844 
2845   if (UseRTMLocking) {
2846     // Abort RTM transaction before calling runtime
2847     // because critical section will be large and will be
2848     // aborted anyway. Also nmethod could be deoptimized.
2849     __ xabort(0);
2850   }
2851 
2852   // Make room for return address (or push it again)
2853   if (!cause_return) {
2854     __ push(rbx);
2855   }
2856 
2857   // Save registers, fpu state, and flags
2858   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
2859 
2860   // The following is basically a call_VM.  However, we need the precise
2861   // address of the call in order to generate an oopmap. Hence, we do all the
2862   // work ourselves.
2863 
2864   __ set_last_Java_frame(noreg, noreg, NULL);
2865 
2866   // The return address must always be correct so that the frame constructor never
2867   // sees an invalid pc.
2868 
2869   if (!cause_return) {
2870     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
2871     // Additionally, rbx is a callee saved register and we can look at it later to determine
2872     // if someone changed the return address for us!
2873     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
2874     __ movptr(Address(rbp, wordSize), rbx);
2875   }
2876 
2877   // Do the call
2878   __ mov(c_rarg0, r15_thread);
2879   __ call(RuntimeAddress(call_ptr));
2880 
2881   // Set an oopmap for the call site.  This oopmap will map all
2882   // oop-registers and debug-info registers as callee-saved.  This
2883   // will allow deoptimization at this safepoint to find all possible
2884   // debug-info recordings, as well as let GC find all oops.
2885 
2886   oop_maps->add_gc_map( __ pc() - start, map);
2887 
2888   Label noException;
2889 
2890   __ reset_last_Java_frame(false);
2891 
2892   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
2893   __ jcc(Assembler::equal, noException);
2894 
2895   // Exception pending
2896 
2897   RegisterSaver::restore_live_registers(masm, save_vectors);
2898 
2899   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2900 
2901   // No exception case
2902   __ bind(noException);
2903 
2904   Label no_adjust;
2905 #ifdef ASSERT
2906   Label bail;
2907 #endif
2908   if (!cause_return) {
2909     Label no_prefix, not_special;
2910 
2911     // If our stashed return pc was modified by the runtime we avoid touching it
2912     __ cmpptr(rbx, Address(rbp, wordSize));
2913     __ jccb(Assembler::notEqual, no_adjust);
2914 
2915     // Skip over the poll instruction.
2916     // See NativeInstruction::is_safepoint_poll()
2917     // Possible encodings:
2918     //      85 00       test   %eax,(%rax)
2919     //      85 01       test   %eax,(%rcx)
2920     //      85 02       test   %eax,(%rdx)
2921     //      85 03       test   %eax,(%rbx)
2922     //      85 06       test   %eax,(%rsi)
2923     //      85 07       test   %eax,(%rdi)
2924     //
2925     //   41 85 00       test   %eax,(%r8)
2926     //   41 85 01       test   %eax,(%r9)
2927     //   41 85 02       test   %eax,(%r10)
2928     //   41 85 03       test   %eax,(%r11)
2929     //   41 85 06       test   %eax,(%r14)
2930     //   41 85 07       test   %eax,(%r15)
2931     //
2932     //      85 04 24    test   %eax,(%rsp)
2933     //   41 85 04 24    test   %eax,(%r12)
2934     //      85 45 00    test   %eax,0x0(%rbp)
2935     //   41 85 45 00    test   %eax,0x0(%r13)
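    //
    // In effect the poll length is: an optional REX.B prefix (0x41), the 0x85
    // opcode, the modrm byte, and one extra byte when the base register is
    // rsp/r12 (SIB byte) or rbp/r13 (disp8).  A hypothetical helper expressing
    // the same rule, shown here only as a sketch:
    //
    //   static int poll_instruction_length(const unsigned char* pc) {
    //     int i = 0;
    //     if (pc[i] == 0x41)  i++;                  // optional REX.B prefix (r8-r15 bases)
    //     int base  = pc[i + 1] & 0x07;             // low 3 bits of modrm select the base
    //     int extra = (base == 0x04 || base == 0x05) ? 1 : 0; // SIB (rsp/r12) or disp8 (rbp/r13)
    //     return i + 2 + extra;                     // 2..4 bytes, matching the table above
    //   }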
2936 
2937     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
2938     __ jcc(Assembler::notEqual, no_prefix);
2939     __ addptr(rbx, 1);
2940     __ bind(no_prefix);
2941 #ifdef ASSERT
2942     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
2943 #endif
2944     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
2945     // r12/rsp 0x04
2946     // r13/rbp 0x05
2947     __ movzbq(rcx, Address(rbx, 1));
2948     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
2949     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
2950     __ cmpptr(rcx, 1);
2951     __ jcc(Assembler::above, not_special);
2952     __ addptr(rbx, 1);
2953     __ bind(not_special);
2954 #ifdef ASSERT
2955     // Verify the correct encoding of the poll we're about to skip.
2956     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
2957     __ jcc(Assembler::notEqual, bail);
2958     // Mask out the modrm bits
2959     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
2960     // rax encodes to 0, so if the bits are nonzero it's incorrect
2961     __ jcc(Assembler::notZero, bail);
2962 #endif
2963     // Adjust return pc forward to step over the safepoint poll instruction
2964     __ addptr(rbx, 2);
2965     __ movptr(Address(rbp, wordSize), rbx);
2966   }
2967 
2968   __ bind(no_adjust);
2969   // Normal exit, restore registers and exit.
2970   RegisterSaver::restore_live_registers(masm, save_vectors);
2971   __ ret(0);
2972 
2973 #ifdef ASSERT
2974   __ bind(bail);
2975   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
2976 #endif
2977 
2978   // Make sure all code is generated
2979   masm->flush();
2980 
2981   // Fill-out other meta info
2982   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
2983 }
2984 
2985 //
2986 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
2987 //
2988 // Generate a stub that calls into vm to find out the proper destination
2989 // of a Java call. All the argument registers are live at this point,
2990 // but since this is generic code we don't know what they are, and the caller
2991 // must do any GC of the args.
2992 //
2993 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
2994   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
2995 
2996   // allocate space for the code
2997   ResourceMark rm;
2998 
2999   CodeBuffer buffer(name, 1200, 512);
3000   MacroAssembler* masm                = new MacroAssembler(&buffer);
3001 
3002   int frame_size_in_words;
3003 
3004   OopMapSet *oop_maps = new OopMapSet();
3005   OopMap* map = NULL;
3006 
3007   int start = __ offset();
3008 
3009   // No need to save vector registers since they are caller-saved anyway.
3010   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
3011 
3012   int frame_complete = __ offset();
3013 
3014   __ set_last_Java_frame(noreg, noreg, NULL);
3015 
3016   __ mov(c_rarg0, r15_thread);
3017 
3018   __ call(RuntimeAddress(destination));
3019 
3020 
3021   // Set an oopmap for the call site.
3022   // We need this not only for callee-saved registers, but also for volatile
3023   // registers that the compiler might be keeping live across a safepoint.
3024 
3025   oop_maps->add_gc_map( __ offset() - start, map);
3026 
3027   // rax contains the address we are going to jump to assuming no exception got installed
3028 
3029   // clear last_Java_sp
3030   __ reset_last_Java_frame(false);
3031   // check for pending exceptions
3032   Label pending;
3033   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3034   __ jcc(Assembler::notEqual, pending);
3035 
3036   // get the returned Method*
3037   __ get_vm_result_2(rbx, r15_thread);
3038   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3039 
3040   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3041 
3042   RegisterSaver::restore_live_registers(masm);
3043 
3044   // We are back to the original state on entry and ready to go.
3045 
3046   __ jmp(rax);
3047 
3048   // Pending exception after the safepoint
3049 
3050   __ bind(pending);
3051 
3052   RegisterSaver::restore_live_registers(masm);
3053 
3054   // exception pending => remove activation and forward to exception handler
3055 
3056   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3057 
3058   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3059   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3060 
3061   // -------------
3062   // make sure all code is generated
3063   masm->flush();
3064 
3065   // return the blob
3066   // frame_size_in_words is in words, which is what new_runtime_stub expects
3067   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3068 }
3069 
3070 #ifdef COMPILER2
3071 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3072 
3073 class NativeInvokerGenerator : public StubCodeGenerator {
3074   address _call_target;
3075   int _shadow_space_bytes;
3076 
3077   const GrowableArray<VMReg>& _input_registers;
3078   const GrowableArray<VMReg>& _output_registers;
3079 
3080   int _frame_complete;
3081   int _framesize;
3082   OopMapSet* _oop_maps;
3083 public:
3084   NativeInvokerGenerator(CodeBuffer* buffer,
3085                          address call_target,
3086                          int shadow_space_bytes,
3087                          const GrowableArray<VMReg>& input_registers,
3088                          const GrowableArray<VMReg>& output_registers)
3089    : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3090      _call_target(call_target),
3091      _shadow_space_bytes(shadow_space_bytes),
3092      _input_registers(input_registers),
3093      _output_registers(output_registers),
3094      _frame_complete(0),
3095      _framesize(0),
3096      _oop_maps(NULL) {
3097     assert(_output_registers.length() <= 1
3098            || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3099 
3100   }
3101 
3102   void generate();
3103 
3104   int spill_size_in_bytes() const {
3105     if (_output_registers.length() == 0) {
3106       return 0;
3107     }
3108     VMReg reg = _output_registers.at(0);
3109     assert(reg->is_reg(), "must be a register");
3110     if (reg->is_Register()) {
3111       return 8;
3112     } else if (reg->is_XMMRegister()) {
3113       if (UseAVX >= 3) {
3114         return 64;
3115       } else if (UseAVX >= 1) {
3116         return 32;
3117       } else {
3118         return 16;
3119       }
3120     } else {
3121       ShouldNotReachHere();
3122     }
3123     return 0;
3124   }
3125 
3126   void spill_out_registers() {
3127     if (_output_registers.length() == 0) {
3128       return;
3129     }
3130     VMReg reg = _output_registers.at(0);
3131     assert(reg->is_reg(), "must be a register");
3132     MacroAssembler* masm = _masm;
3133     if (reg->is_Register()) {
3134       __ movptr(Address(rsp, 0), reg->as_Register());
3135     } else if (reg->is_XMMRegister()) {
3136       if (UseAVX >= 3) {
3137         __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3138       } else if (UseAVX >= 1) {
3139         __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3140       } else {
3141         __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3142       }
3143     } else {
3144       ShouldNotReachHere();
3145     }
3146   }
3147 
3148   void fill_out_registers() {
3149     if (_output_registers.length() == 0) {
3150       return;
3151     }
3152     VMReg reg = _output_registers.at(0);
3153     assert(reg->is_reg(), "must be a register");
3154     MacroAssembler* masm = _masm;
3155     if (reg->is_Register()) {
3156       __ movptr(reg->as_Register(), Address(rsp, 0));
3157     } else if (reg->is_XMMRegister()) {
3158       if (UseAVX >= 3) {
3159         __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3160       } else if (UseAVX >= 1) {
3161         __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3162       } else {
3163         __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3164       }
3165     } else {
3166       ShouldNotReachHere();
3167     }
3168   }
3169 
3170   int frame_complete() const {
3171     return _frame_complete;
3172   }
3173 
3174   int framesize() const {
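    // _framesize is tracked in 32-bit slots; report it in 64-bit words, which is
    // the unit the RuntimeStub frame size is expressed in.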
3175     return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3176   }
3177 
3178   OopMapSet* oop_maps() const {
3179     return _oop_maps;
3180   }
3181 
3182 private:
3183 #ifdef ASSERT
3184 bool target_uses_register(VMReg reg) {
3185   return _input_registers.contains(reg) || _output_registers.contains(reg);
3186 }
3187 #endif
3188 };
3189 
3190 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3191                                                 int shadow_space_bytes,
3192                                                 const GrowableArray<VMReg>& input_registers,
3193                                                 const GrowableArray<VMReg>& output_registers) {
3194   int locs_size  = 64;
3195   CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3196   NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3197   g.generate();
3198   code.log_section_sizes("nep_invoker_blob");
3199 
3200   RuntimeStub* stub =
3201     RuntimeStub::new_runtime_stub("nep_invoker_blob",
3202                                   &code,
3203                                   g.frame_complete(),
3204                                   g.framesize(),
3205                                   g.oop_maps(), false);
3206   return stub;
3207 }
3208 
3209 void NativeInvokerGenerator::generate() {
3210   assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3211 
3212   enum layout {
3213     rbp_off,
3214     rbp_off2,
3215     return_off,
3216     return_off2,
3217     framesize // inclusive of return address
3218   };
3219 
3220   _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3221   assert(is_even(_framesize/2), "sp not 16-byte aligned");
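  // _framesize counts 32-bit stack slots: the fixed rbp/return-address slots plus
  // the shadow space and the return-value spill area, rounded up to a multiple of
  // four slots (16 bytes) so that rsp stays 16-byte aligned.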
3222 
3223   _oop_maps  = new OopMapSet();
3224   MacroAssembler* masm = _masm;
3225 
3226   address start = __ pc();
3227 
3228   __ enter();
3229 
3230   // return address and rbp are already in place
3231   __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3232 
3233   _frame_complete = __ pc() - start;
3234 
3235   address the_pc = __ pc();
3236 
3237   __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3238   OopMap* map = new OopMap(_framesize, 0);
3239   _oop_maps->add_gc_map(the_pc - start, map);
3240 
3241   // State transition
3242   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3243 
3244   __ call(RuntimeAddress(_call_target));
3245 
3246   __ restore_cpu_control_state_after_jni();
3247 
3248   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3249 
3250   // Force this write out before the read below
3251   __ membar(Assembler::Membar_mask_bits(
3252           Assembler::LoadLoad | Assembler::LoadStore |
3253           Assembler::StoreLoad | Assembler::StoreStore));
3254 
3255   Label L_after_safepoint_poll;
3256   Label L_safepoint_poll_slow_path;
3257 
3258   __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3259   __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3260   __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3261 
3262   __ bind(L_after_safepoint_poll);
3263 
3264   // change thread state
3265   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3266 
3267   __ block_comment("reguard stack check");
3268   Label L_reguard;
3269   Label L_after_reguard;
3270   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3271   __ jcc(Assembler::equal, L_reguard);
3272   __ bind(L_after_reguard);
3273 
3274   __ reset_last_Java_frame(r15_thread, true);
3275 
3276   __ leave(); // required for proper stackwalking of RuntimeStub frame
3277   __ ret(0);
3278 
3279   //////////////////////////////////////////////////////////////////////////////
3280 
3281   __ block_comment("{ L_safepoint_poll_slow_path");
3282   __ bind(L_safepoint_poll_slow_path);
3283   __ vzeroupper();
3284 
3285   spill_out_registers();
3286 
3287   __ mov(c_rarg0, r15_thread);
3288   __ mov(r12, rsp); // remember sp
3289   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3290   __ andptr(rsp, -16); // align stack as required by ABI
3291   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3292   __ mov(rsp, r12); // restore sp
3293   __ reinit_heapbase();
3294 
3295   fill_out_registers();
3296 
3297   __ jmp(L_after_safepoint_poll);
3298   __ block_comment("} L_safepoint_poll_slow_path");
3299 
3300   //////////////////////////////////////////////////////////////////////////////
3301 
3302   __ block_comment("{ L_reguard");
3303   __ bind(L_reguard);
3304   __ vzeroupper();
3305 
3306   spill_out_registers();
3307 
3308   __ mov(r12, rsp); // remember sp
3309   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3310   __ andptr(rsp, -16); // align stack as required by ABI
3311   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3312   __ mov(rsp, r12); // restore sp
3313   __ reinit_heapbase();
3314 
3315   fill_out_registers();
3316 
3317   __ jmp(L_after_reguard);
3318 
3319   __ block_comment("} L_reguard");
3320 
3321   //////////////////////////////////////////////////////////////////////////////
3322 
3323   __ flush();
3324 }
3325 #endif // COMPILER2
3326 
3327 //------------------------------Montgomery multiplication------------------------
3328 //
3329 
3330 #ifndef _WINDOWS
3331 
3332 // Subtract 0:b from carry:a.  Return carry.
3333 static julong
3334 sub(julong a[], julong b[], julong carry, long len) {
3335   long long i = 0, cnt = len;
3336   julong tmp;
3337   asm volatile("clc; "
3338                "0: ; "
3339                "mov (%[b], %[i], 8), %[tmp]; "
3340                "sbb %[tmp], (%[a], %[i], 8); "
3341                "inc %[i]; dec %[cnt]; "
3342                "jne 0b; "
3343                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3344                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3345                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3346                : "memory");
3347   return tmp;
3348 }
3349 
3350 // Multiply (unsigned) Long A by Long B, accumulating the double-
3351 // length result into the accumulator formed of T0, T1, and T2.
3352 #define MACC(A, B, T0, T1, T2)                                  \
3353 do {                                                            \
3354   unsigned long hi, lo;                                         \
3355   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3356            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3357            : "r"(A), "a"(B) : "cc");                            \
3358  } while(0)
3359 
3360 // As above, but add twice the double-length result into the
3361 // accumulator.
3362 #define MACC2(A, B, T0, T1, T2)                                 \
3363 do {                                                            \
3364   unsigned long hi, lo;                                         \
3365   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3366            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3367            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3368            : "r"(A), "a"(B) : "cc");                            \
3369  } while(0)
3370 
3371 #else //_WINDOWS
3372 
3373 static julong
3374 sub(julong a[], julong b[], julong carry, long len) {
3375   long i;
3376   julong tmp;
3377   unsigned char c = 1;
3378   for (i = 0; i < len; i++) {
3379     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3380     a[i] = tmp;
3381   }
3382   c = _addcarry_u64(c, carry, ~0, &tmp);
3383   return tmp;
3384 }
3385 
3386 // Multiply (unsigned) Long A by Long B, accumulating the double-
3387 // length result into the accumulator formed of T0, T1, and T2.
3388 #define MACC(A, B, T0, T1, T2)                          \
3389 do {                                                    \
3390   julong hi, lo;                            \
3391   lo = _umul128(A, B, &hi);                             \
3392   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3393   c = _addcarry_u64(c, hi, T1, &T1);                    \
3394   _addcarry_u64(c, T2, 0, &T2);                         \
3395  } while(0)
3396 
3397 // As above, but add twice the double-length result into the
3398 // accumulator.
3399 #define MACC2(A, B, T0, T1, T2)                         \
3400 do {                                                    \
3401   julong hi, lo;                            \
3402   lo = _umul128(A, B, &hi);                             \
3403   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3404   c = _addcarry_u64(c, hi, T1, &T1);                    \
3405   _addcarry_u64(c, T2, 0, &T2);                         \
3406   c = _addcarry_u64(0, lo, T0, &T0);                    \
3407   c = _addcarry_u64(c, hi, T1, &T1);                    \
3408   _addcarry_u64(c, T2, 0, &T2);                         \
3409  } while(0)
3410 
3411 #endif //_WINDOWS
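
// For reference, the accumulation performed by MACC above can be written with a
// 128-bit intermediate.  This is a sketch only (not compiled): it assumes the
// compiler provides unsigned __int128, and the helper name is hypothetical.
#if 0
static inline void macc_reference(julong a, julong b,
                                  julong& t0, julong& t1, julong& t2) {
  unsigned __int128 p = (unsigned __int128)a * b;                    // double-length product
  unsigned __int128 s = (unsigned __int128)t0 + (julong)p;           // add low half into t0
  t0 = (julong)s;
  s = (unsigned __int128)t1 + (julong)(p >> 64) + (julong)(s >> 64); // high half plus carry into t1
  t1 = (julong)s;
  t2 += (julong)(s >> 64);                                           // final carry into t2
}
// MACC2(A, B, ...) performs the same accumulation twice, i.e. it adds 2*A*B.
#endif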
3412 
3413 // Fast Montgomery multiplication.  The derivation of the algorithm is
3414 // in  A Cryptographic Library for the Motorola DSP56000,
3415 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
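//
// In brief: the julong arrays are little-endian (least significant word first).
// With R = 2^(64*len) and inv chosen so that inv * n[0] == -1 (mod 2^64), as the
// assert below checks, the routine computes m == a * b * R^-1 (mod n), adding a
// multiple of n at each step so that the low accumulator word becomes zero and
// can be shifted out.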
3416 
3417 static void NOINLINE
3418 montgomery_multiply(julong a[], julong b[], julong n[],
3419                     julong m[], julong inv, int len) {
3420   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3421   int i;
3422 
3423   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3424 
3425   for (i = 0; i < len; i++) {
3426     int j;
3427     for (j = 0; j < i; j++) {
3428       MACC(a[j], b[i-j], t0, t1, t2);
3429       MACC(m[j], n[i-j], t0, t1, t2);
3430     }
3431     MACC(a[i], b[0], t0, t1, t2);
3432     m[i] = t0 * inv;
3433     MACC(m[i], n[0], t0, t1, t2);
3434 
3435     assert(t0 == 0, "broken Montgomery multiply");
3436 
3437     t0 = t1; t1 = t2; t2 = 0;
3438   }
3439 
3440   for (i = len; i < 2*len; i++) {
3441     int j;
3442     for (j = i-len+1; j < len; j++) {
3443       MACC(a[j], b[i-j], t0, t1, t2);
3444       MACC(m[j], n[i-j], t0, t1, t2);
3445     }
3446     m[i-len] = t0;
3447     t0 = t1; t1 = t2; t2 = 0;
3448   }
3449 
3450   while (t0)
3451     t0 = sub(m, n, t0, len);
3452 }
3453 
3454 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3455 // multiplies so it should be up to 25% faster than Montgomery
3456 // multiplication.  However, its loop control is more complex and it
3457 // may actually run slower on some machines.
3458 
3459 static void NOINLINE
3460 montgomery_square(julong a[], julong n[],
3461                   julong m[], julong inv, int len) {
3462   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3463   int i;
3464 
3465   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3466 
3467   for (i = 0; i < len; i++) {
3468     int j;
3469     int end = (i+1)/2;
3470     for (j = 0; j < end; j++) {
3471       MACC2(a[j], a[i-j], t0, t1, t2);
3472       MACC(m[j], n[i-j], t0, t1, t2);
3473     }
3474     if ((i & 1) == 0) {
3475       MACC(a[j], a[j], t0, t1, t2);
3476     }
3477     for (; j < i; j++) {
3478       MACC(m[j], n[i-j], t0, t1, t2);
3479     }
3480     m[i] = t0 * inv;
3481     MACC(m[i], n[0], t0, t1, t2);
3482 
3483     assert(t0 == 0, "broken Montgomery square");
3484 
3485     t0 = t1; t1 = t2; t2 = 0;
3486   }
3487 
3488   for (i = len; i < 2*len; i++) {
3489     int start = i-len+1;
3490     int end = start + (len - start)/2;
3491     int j;
3492     for (j = start; j < end; j++) {
3493       MACC2(a[j], a[i-j], t0, t1, t2);
3494       MACC(m[j], n[i-j], t0, t1, t2);
3495     }
3496     if ((i & 1) == 0) {
3497       MACC(a[j], a[j], t0, t1, t2);
3498     }
3499     for (; j < len; j++) {
3500       MACC(m[j], n[i-j], t0, t1, t2);
3501     }
3502     m[i-len] = t0;
3503     t0 = t1; t1 = t2; t2 = 0;
3504   }
3505 
3506   while (t0)
3507     t0 = sub(m, n, t0, len);
3508 }
3509 
3510 // Swap words in a longword.
3511 static julong swap(julong x) {
3512   return (x << 32) | (x >> 32);
3513 }
3514 
3515 // Copy len longwords from s to d, word-swapping as we go.  The
3516 // destination array is reversed.
3517 static void reverse_words(julong *s, julong *d, int len) {
3518   d += len;
3519   while(len-- > 0) {
3520     d--;
3521     *d = swap(*s);
3522     s++;
3523   }
3524 }
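// For example, with len == 2 this yields d[1] = swap(s[0]) and d[0] = swap(s[1]),
// converting between the word order of the incoming jint arrays and the
// little-endian julong order used by the routines above.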
3525 
3526 // The threshold at which squaring is advantageous was determined
3527 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3528 #define MONTGOMERY_SQUARING_THRESHOLD 64
3529 
3530 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3531                                         jint len, jlong inv,
3532                                         jint *m_ints) {
3533   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3534   int longwords = len/2;
3535 
3536   // Make very sure we don't use so much space that the stack might
3537   // overflow.  512 jints correspond to a 16384-bit integer and
3538   // will use here a total of 8k bytes of stack space.
3539   int divisor = sizeof(julong) * 4;
3540   guarantee(longwords <= 8192 / divisor, "must be");
3541   int total_allocation = longwords * sizeof (julong) * 4;
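  // With sizeof(julong) == 8 the guarantee above caps longwords at 256 (i.e. a
  // 512-jint input), so total_allocation is at most 256 * 8 * 4 = 8192 bytes.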
3542   julong *scratch = (julong *)alloca(total_allocation);
3543 
3544   // Local scratch arrays
3545   julong
3546     *a = scratch + 0 * longwords,
3547     *b = scratch + 1 * longwords,
3548     *n = scratch + 2 * longwords,
3549     *m = scratch + 3 * longwords;
3550 
3551   reverse_words((julong *)a_ints, a, longwords);
3552   reverse_words((julong *)b_ints, b, longwords);
3553   reverse_words((julong *)n_ints, n, longwords);
3554 
3555   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3556 
3557   reverse_words(m, (julong *)m_ints, longwords);
3558 }
3559 
3560 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3561                                       jint len, jlong inv,
3562                                       jint *m_ints) {
3563   assert(len % 2 == 0, "array length in montgomery_square must be even");
3564   int longwords = len/2;
3565 
3566   // Make very sure we don't use so much space that the stack might
3567   // overflow.  512 jints correspond to a 16384-bit integer and
3568   // will use here a total of 6k bytes of stack space.
3569   int divisor = sizeof(julong) * 3;
3570   guarantee(longwords <= (8192 / divisor), "must be");
3571   int total_allocation = longwords * sizeof (julong) * 3;
3572   julong *scratch = (julong *)alloca(total_allocation);
3573 
3574   // Local scratch arrays
3575   julong
3576     *a = scratch + 0 * longwords,
3577     *n = scratch + 1 * longwords,
3578     *m = scratch + 2 * longwords;
3579 
3580   reverse_words((julong *)a_ints, a, longwords);
3581   reverse_words((julong *)n_ints, n, longwords);
3582 
3583   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3584     ::montgomery_square(a, n, m, (julong)inv, longwords);
3585   } else {
3586     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3587   }
3588 
3589   reverse_words(m, (julong *)m_ints, longwords);
3590 }
3591 
3592 #ifdef COMPILER2
3593 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3594 //
3595 //------------------------------generate_exception_blob---------------------------
3596 // creates exception blob at the end
3597 // This code is reached from compiled methods via a jump through the exception blob.
3598 // (see emit_exception_handler in x86_64.ad file)
3599 //
3600 // Given an exception pc at a call we call into the runtime for the
3601 // handler in this method. This handler might merely restore state
3602 // (i.e. callee save registers), unwind the frame, and jump to the
3603 // exception handler for the nmethod if there is no Java-level handler
3604 // for the nmethod.
3605 //
3606 // This code is entered with a jmp.
3607 //
3608 // Arguments:
3609 //   rax: exception oop
3610 //   rdx: exception pc
3611 //
3612 // Results:
3613 //   rax: exception oop
3614 //   rdx: exception pc in caller or ???
3615 //   destination: exception handler of caller
3616 //
3617 // Note: the exception pc MUST be at a call (precise debug information)
3618 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3619 //
3620 
3621 void OptoRuntime::generate_exception_blob() {
3622   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3623   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3624   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3625 
3626   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3627 
3628   // Allocate space for the code
3629   ResourceMark rm;
3630   // Setup code generation tools
3631   CodeBuffer buffer("exception_blob", 2048, 1024);
3632   MacroAssembler* masm = new MacroAssembler(&buffer);
3633 
3634 
3635   address start = __ pc();
3636 
3637   // Exception pc is 'return address' for stack walker
3638   __ push(rdx);
3639   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3640 
3641   // Save callee-saved registers.  See x86_64.ad.
3642 
3643   // rbp is an implicitly saved callee saved register (i.e., the calling
3644   // convention will save/restore it in the prolog/epilog). Other than that
3645   // there are no callee save registers now that adapter frames are gone.
3646 
3647   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3648 
3649   // Store exception in Thread object. We cannot pass any arguments to the
3650   // handle_exception call, since we do not want to make any assumption
3651   // about the size of the frame where the exception happened in.
3652   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3653   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3654   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3655 
3656   // This call does all the hard work.  It checks if an exception handler
3657   // exists in the method.
3658   // If so, it returns the handler address.
3659   // If not, it prepares for stack-unwinding, restoring the callee-save
3660   // registers of the frame being removed.
3661   //
3662   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3663 
3664   // At a method handle call, the stack may not be properly aligned
3665   // when returning with an exception.
3666   address the_pc = __ pc();
3667   __ set_last_Java_frame(noreg, noreg, the_pc);
3668   __ mov(c_rarg0, r15_thread);
3669   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3670   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3671 
3672   // Set an oopmap for the call site.  This oopmap will only be used if we
3673   // are unwinding the stack.  Hence, all locations will be dead.
3674   // Callee-saved registers will be the same as the frame above (i.e.,
3675   // handle_exception_stub), since they were restored when we got the
3676   // exception.
3677 
3678   OopMapSet* oop_maps = new OopMapSet();
3679 
3680   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3681 
3682   __ reset_last_Java_frame(false);
3683 
3684   // Restore callee-saved registers
3685 
3686   // rbp is an implicitly saved callee-saved register (i.e., the calling
3687 // convention will save/restore it in the prolog/epilog). Other than that
3688   // there are no callee save registers now that adapter frames are gone.
3689 
3690   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3691 
3692   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3693   __ pop(rdx);                  // No need for exception pc anymore
3694 
3695   // rax: exception handler
3696 
3697   // We have a handler in rax (could be deopt blob).
3698   __ mov(r8, rax);
3699 
3700   // Get the exception oop
3701   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3702   // Get the exception pc in case we are deoptimized
3703   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3704 #ifdef ASSERT
3705   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3706   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3707 #endif
3708   // Clear the exception oop so GC no longer processes it as a root.
3709   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3710 
3711   // rax: exception oop
3712   // r8:  exception handler
3713   // rdx: exception pc
3714   // Jump to handler
3715 
3716   __ jmp(r8);
3717 
3718   // Make sure all code is generated
3719   masm->flush();
3720 
3721   // Set exception blob
3722   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3723 }
3724 #endif // COMPILER2
3725 
3726 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
3727                                        int total_in_args, const VMRegPair* in_regs,
3728                                        int total_out_args, VMRegPair* out_regs,
3729                                        GrowableArray<int>& arg_order,
3730                                        VMRegPair tmp_vmreg) {
3731   ComputeMoveOrder order(total_in_args, in_regs,
3732                          total_out_args, out_regs,
3733                          in_sig_bt, arg_order, tmp_vmreg);
3734 }