1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/debugInfoRec.hpp"
  32 #include "code/icBuffer.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/compiledICHolder.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/jniHandles.hpp"
  48 #include "runtime/safepointMechanism.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/signature.hpp"
  51 #include "runtime/stubRoutines.hpp"
  52 #include "runtime/vframeArray.hpp"
  53 #include "runtime/vm_version.hpp"
  54 #include "utilities/align.hpp"
  55 #include "utilities/formatBuffer.hpp"
  56 #include "vmreg_x86.inline.hpp"
  57 #ifdef COMPILER1
  58 #include "c1/c1_Runtime1.hpp"
  59 #endif
  60 #ifdef COMPILER2
  61 #include "opto/runtime.hpp"
  62 #endif
  63 #if INCLUDE_JVMCI
  64 #include "jvmci/jvmciJavaClasses.hpp"
  65 #endif
  66 
  67 #define __ masm->
  68 
  69 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
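// For illustration: on x86-64 StackAlignmentInBytes is 16 and a VMRegImpl stack
// slot is 4 bytes, so this works out to 4 slots per alignment unit.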
  70 
  71 class SimpleRuntimeFrame {
  72 
  73   public:
  74 
  75   // Most of the runtime stubs have this simple frame layout.
  76   // This class exists to make the layout shared in one place.
  77   // Offsets are for compiler stack slots, which are jints.
  78   enum layout {
  79     // The frame sender code expects that rbp will be in the "natural" place and
  80     // will override any oopMap setting for it. We must therefore force the layout
  81     // so that it agrees with the frame sender code.
  82     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  83     rbp_off2,
  84     return_off, return_off2,
  85     framesize
  86   };
  87 };
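// For the layout above, assuming frame::arg_reg_save_area_bytes is 0 (as on
// non-Windows targets): rbp occupies slots 0-1, the return address slots 2-3,
// and framesize is 4 slots, i.e. 16 bytes.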
  88 
  89 class RegisterSaver {
  90   // Capture info about frame layout.  Layout offsets are in jint
  91   // units because compiler frame slots are jints.
  92 #define XSAVE_AREA_BEGIN 160
  93 #define XSAVE_AREA_YMM_BEGIN 576
  94 #define XSAVE_AREA_OPMASK_BEGIN 1088
  95 #define XSAVE_AREA_ZMM_BEGIN 1152
  96 #define XSAVE_AREA_UPPERBANK 1664
  97 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  98 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  99 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 100 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 101 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
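// For example, DEF_XMM_OFFS(0) expands to
//   xmm0_off = xmm_off + 0*16/BytesPerInt, xmm0H_off
// so each XMM register is given 16 bytes (four 4-byte slots) in the save area,
// and only the first two slots (the low 8 bytes) get named enum values.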
 102   enum layout {
 103     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 104     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 105     DEF_XMM_OFFS(0),
 106     DEF_XMM_OFFS(1),
 107     // 2..15 are implied in range usage
 108     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 109     DEF_YMM_OFFS(0),
 110     DEF_YMM_OFFS(1),
 111     // 2..15 are implied in range usage
 112     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 113     DEF_OPMASK_OFFS(0),
 114     DEF_OPMASK_OFFS(1),
 115     // 2..7 are implied in range usage
 116     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 117     DEF_ZMM_OFFS(0),
 118     DEF_ZMM_OFFS(1),
 119     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 120     DEF_ZMM_UPPER_OFFS(16),
 121     DEF_ZMM_UPPER_OFFS(17),
 122     // 18..31 are implied in range usage
 123     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 124     fpu_stateH_end,
 125     r15_off, r15H_off,
 126     r14_off, r14H_off,
 127     r13_off, r13H_off,
 128     r12_off, r12H_off,
 129     r11_off, r11H_off,
 130     r10_off, r10H_off,
 131     r9_off,  r9H_off,
 132     r8_off,  r8H_off,
 133     rdi_off, rdiH_off,
 134     rsi_off, rsiH_off,
 135     ignore_off, ignoreH_off,  // extra copy of rbp
 136     rsp_off, rspH_off,
 137     rbx_off, rbxH_off,
 138     rdx_off, rdxH_off,
 139     rcx_off, rcxH_off,
 140     rax_off, raxH_off,
 141     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 142     align_off, alignH_off,
 143     flags_off, flagsH_off,
 144     // The frame sender code expects that rbp will be in the "natural" place and
 145     // will override any oopMap setting for it. We must therefore force the layout
 146     // so that it agrees with the frame sender code.
 147     rbp_off, rbpH_off,        // copy of rbp we will restore
 148     return_off, returnH_off,  // slot for return address
 149     reg_save_size             // size in compiler stack slots
 150   };
 151 
 152  public:
 153   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors);
 154   static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 155 
 156   // Offsets into the register save area
 157   // Used by deoptimization when it is managing result register
 158   // values on its own
 159 
 160   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 161   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 162   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 163   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 164   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 165 
 166   // During deoptimization only the result registers need to be restored,
 167   // all the other values have already been extracted.
 168   static void restore_result_registers(MacroAssembler* masm);
 169 };
 170 
// Register is a class, but it is assigned a numerical value: rax maps to "0",
// which the compiler can mistake for a null pointer. Thus we need to ignore -Wnonnull.
 173 PRAGMA_DIAG_PUSH
 174 PRAGMA_NONNULL_IGNORED
 175 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
 176   int off = 0;
 177   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 178   if (UseAVX < 3) {
 179     num_xmm_regs = num_xmm_regs/2;
 180   }
 181 #if COMPILER2_OR_JVMCI
 182   if (save_vectors && UseAVX == 0) {
    save_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 184   }
 185   assert(!save_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 186 #else
 187   save_vectors = false; // vectors are generated only by C2 and JVMCI
 188 #endif
 189 
  // Always make the frame size 16-byte aligned; both vector and non-vector stack frames get this alignment
 191   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 192   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 193   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 194   // CodeBlob frame size is in words.
 195   int frame_size_in_words = frame_size_in_bytes / wordSize;
 196   *total_frame_words = frame_size_in_words;
 197 
 198   // Save registers, fpu state, and flags.
 199   // We assume caller has already pushed the return address onto the
 200   // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, just like a normal enter().
 203 
 204   __ enter();          // rsp becomes 16-byte aligned here
 205   __ push_CPU_state(); // Push a multiple of 16 bytes
 206 
  // push_CPU_state() already handles this on EVEX-enabled targets
 208   if (save_vectors) {
 209     // Save upper half of YMM registers(0..15)
 210     int base_addr = XSAVE_AREA_YMM_BEGIN;
 211     for (int n = 0; n < 16; n++) {
 212       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 213     }
 214     if (VM_Version::supports_evex()) {
 215       // Save upper half of ZMM registers(0..15)
 216       base_addr = XSAVE_AREA_ZMM_BEGIN;
 217       for (int n = 0; n < 16; n++) {
 218         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 219       }
 220       // Save full ZMM registers(16..num_xmm_regs)
 221       base_addr = XSAVE_AREA_UPPERBANK;
 222       off = 0;
 223       int vector_len = Assembler::AVX_512bit;
 224       for (int n = 16; n < num_xmm_regs; n++) {
 225         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 226       }
 227 #if COMPILER2_OR_JVMCI
 228       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 229       off = 0;
 230       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 231         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 232       }
 233 #endif
 234     }
 235   } else {
 236     if (VM_Version::supports_evex()) {
 237       // Save upper bank of ZMM registers(16..31) for double/float usage
 238       int base_addr = XSAVE_AREA_UPPERBANK;
 239       off = 0;
 240       for (int n = 16; n < num_xmm_regs; n++) {
 241         __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
 242       }
 243 #if COMPILER2_OR_JVMCI
 244       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 245       off = 0;
 246       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 247         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 248       }
 249 #endif
 250     }
 251   }
 252   __ vzeroupper();
 253   if (frame::arg_reg_save_area_bytes != 0) {
 254     // Allocate argument register save area
 255     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 256   }
 257 
 258   // Set an oopmap for the call site.  This oopmap will map all
 259   // oop-registers and debug-info registers as callee-saved.  This
 260   // will allow deoptimization at this safepoint to find all possible
 261   // debug-info recordings, as well as let GC find all oops.
 262 
 263   OopMapSet *oop_maps = new OopMapSet();
 264   OopMap* map = new OopMap(frame_size_in_slots, 0);
 265 
 266 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 267 
 268   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 271   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code; it needs no oopmap
  // and the location where rbp was saved is ignored
 274   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 283   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX-enabled targets it is included in the xsave area
 286   off = xmm0_off;
 287   int delta = xmm1_off - off;
 288   for (int n = 0; n < 16; n++) {
 289     XMMRegister xmm_name = as_XMMRegister(n);
 290     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 291     off += delta;
 292   }
 293   if (UseAVX > 2) {
 294     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 295     off = zmm16_off;
 296     delta = zmm17_off - off;
 297     for (int n = 16; n < num_xmm_regs; n++) {
 298       XMMRegister zmm_name = as_XMMRegister(n);
 299       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 300       off += delta;
 301     }
 302   }
 303 
 304 #if COMPILER2_OR_JVMCI
 305   if (save_vectors) {
 306     // Save upper half of YMM registers(0..15)
 307     off = ymm0_off;
 308     delta = ymm1_off - ymm0_off;
 309     for (int n = 0; n < 16; n++) {
 310       XMMRegister ymm_name = as_XMMRegister(n);
 311       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 312       off += delta;
 313     }
 314     if (VM_Version::supports_evex()) {
 315       // Save upper half of ZMM registers(0..15)
 316       off = zmm0_off;
 317       delta = zmm1_off - zmm0_off;
 318       for (int n = 0; n < 16; n++) {
 319         XMMRegister zmm_name = as_XMMRegister(n);
 320         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 321         off += delta;
 322       }
 323     }
 324   }
 325 #endif // COMPILER2_OR_JVMCI
 326 
 327   // %%% These should all be a waste but we'll keep things as they were for now
 328   if (true) {
 329     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 333     // rbp location is known implicitly by the frame sender code, needs no oopmap
 334     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 343     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX-enabled targets it is included in the xsave area
 346     off = xmm0H_off;
 347     delta = xmm1H_off - off;
 348     for (int n = 0; n < 16; n++) {
 349       XMMRegister xmm_name = as_XMMRegister(n);
 350       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 351       off += delta;
 352     }
 353     if (UseAVX > 2) {
 354       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 355       off = zmm16H_off;
 356       delta = zmm17H_off - off;
 357       for (int n = 16; n < num_xmm_regs; n++) {
 358         XMMRegister zmm_name = as_XMMRegister(n);
 359         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 360         off += delta;
 361       }
 362     }
 363   }
 364 
 365   return map;
 366 }
 367 PRAGMA_DIAG_POP
 368 
 369 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
 370   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 371   if (UseAVX < 3) {
 372     num_xmm_regs = num_xmm_regs/2;
 373   }
 374   if (frame::arg_reg_save_area_bytes != 0) {
 375     // Pop arg register save area
 376     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 377   }
 378 
 379 #if COMPILER2_OR_JVMCI
 380   if (restore_vectors) {
 381     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 382     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 383   }
 384 #else
 385   assert(!restore_vectors, "vectors are generated only by C2");
 386 #endif
 387 
 388   __ vzeroupper();
 389 
  // On EVEX-enabled targets everything is handled by the FPU state pop below
 391   if (restore_vectors) {
 392     // Restore upper half of YMM registers (0..15)
 393     int base_addr = XSAVE_AREA_YMM_BEGIN;
 394     for (int n = 0; n < 16; n++) {
 395       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 396     }
 397     if (VM_Version::supports_evex()) {
 398       // Restore upper half of ZMM registers (0..15)
 399       base_addr = XSAVE_AREA_ZMM_BEGIN;
 400       for (int n = 0; n < 16; n++) {
 401         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 402       }
 403       // Restore full ZMM registers(16..num_xmm_regs)
 404       base_addr = XSAVE_AREA_UPPERBANK;
 405       int vector_len = Assembler::AVX_512bit;
 406       int off = 0;
 407       for (int n = 16; n < num_xmm_regs; n++) {
 408         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 409       }
 410 #if COMPILER2_OR_JVMCI
 411       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 412       off = 0;
 413       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 414         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 415       }
 416 #endif
 417     }
 418   } else {
 419     if (VM_Version::supports_evex()) {
 420       // Restore upper bank of ZMM registers(16..31) for double/float usage
 421       int base_addr = XSAVE_AREA_UPPERBANK;
 422       int off = 0;
 423       for (int n = 16; n < num_xmm_regs; n++) {
 424         __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
 425       }
 426 #if COMPILER2_OR_JVMCI
 427       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 428       off = 0;
 429       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 430         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 431       }
 432 #endif
 433     }
 434   }
 435 
 436   // Recover CPU state
 437   __ pop_CPU_state();
 438   // Get the rbp described implicitly by the calling convention (no oopMap)
 439   __ pop(rbp);
 440 }
 441 
 442 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 443 
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee-save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only the result registers need to be restored here.
 449 
 450   // Restore fp result register
 451   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 452   // Restore integer result register
 453   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 454   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 455 
  // Pop all of the register save area off the stack except the return address
 457   __ addptr(rsp, return_offset_in_bytes());
 458 }
 459 
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 462 bool SharedRuntime::is_wide_vector(int size) {
 463   return size > 16;
 464 }
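// E.g. 32-byte YMM and 64-byte ZMM vectors are "wide"; 16-byte XMM vectors are
// not, since they are already covered by the default fxsave/fxrstor save.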
 465 
 466 // ---------------------------------------------------------------------------
 467 // Read the array of BasicTypes from a signature, and compute where the
 468 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those at or
// above refer to 4-byte stack slots.  All stack slots are based off of the
// stack pointer as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Register
// values up to RegisterImpl::number_of_registers are the 64-bit
// integer registers.
 476 
 477 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 478 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 479 // units regardless of build. Of course for i486 there is no 64 bit build
 480 
 481 // The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static JNI methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
 485 // advantage out of it.
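//
// Worked example, based purely on the mapping implemented below: for a Java
// signature (Object, int, long, double), total_args_passed is 6 (the long and
// the double each contribute a trailing T_VOID half). The Object lands in
// j_rarg0, the int in j_rarg1, the long in j_rarg2, the double in j_farg0, the
// T_VOID halves get no location, and with everything in registers the method
// needs 0 outgoing stack slots.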
 486 
 487 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 488                                            VMRegPair *regs,
 489                                            int total_args_passed) {
 490 
 491   // Create the mapping between argument positions and
 492   // registers.
 493   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 494     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 495   };
 496   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 497     j_farg0, j_farg1, j_farg2, j_farg3,
 498     j_farg4, j_farg5, j_farg6, j_farg7
 499   };
 500 
 501 
 502   uint int_args = 0;
 503   uint fp_args = 0;
 504   uint stk_args = 0; // inc by 2 each time
 505 
 506   for (int i = 0; i < total_args_passed; i++) {
 507     switch (sig_bt[i]) {
 508     case T_BOOLEAN:
 509     case T_CHAR:
 510     case T_BYTE:
 511     case T_SHORT:
 512     case T_INT:
 513       if (int_args < Argument::n_int_register_parameters_j) {
 514         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 515       } else {
 516         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 517         stk_args += 2;
 518       }
 519       break;
 520     case T_VOID:
 521       // halves of T_LONG or T_DOUBLE
 522       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 523       regs[i].set_bad();
 524       break;
 525     case T_LONG:
 526       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 527       // fall through
 528     case T_OBJECT:
 529     case T_ARRAY:
 530     case T_ADDRESS:
 531       if (int_args < Argument::n_int_register_parameters_j) {
 532         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 533       } else {
 534         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 535         stk_args += 2;
 536       }
 537       break;
 538     case T_FLOAT:
 539       if (fp_args < Argument::n_float_register_parameters_j) {
 540         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 541       } else {
 542         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 543         stk_args += 2;
 544       }
 545       break;
 546     case T_DOUBLE:
 547       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 548       if (fp_args < Argument::n_float_register_parameters_j) {
 549         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 550       } else {
 551         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 552         stk_args += 2;
 553       }
 554       break;
 555     default:
 556       ShouldNotReachHere();
 557       break;
 558     }
 559   }
 560 
 561   return align_up(stk_args, 2);
 562 }
 563 
// Patch the caller's callsite with the entry to compiled code, if it exists.
 565 static void patch_callers_callsite(MacroAssembler *masm) {
 566   Label L;
 567   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
 568   __ jcc(Assembler::equal, L);
 569 
 570   // Save the current stack pointer
 571   __ mov(r13, rsp);
 572   // Schedule the branch target address early.
 573   // Call into the VM to patch the caller, then jump to compiled callee
 574   // rax isn't live so capture return address while we easily can
 575   __ movptr(rax, Address(rsp, 0));
 576 
 577   // align stack so push_CPU_state doesn't fault
 578   __ andptr(rsp, -(StackAlignmentInBytes));
 579   __ push_CPU_state();
 580   __ vzeroupper();
 581   // VM needs caller's callsite
 582   // VM needs target method
 583   // This needs to be a long call since we will relocate this adapter to
 584   // the codeBuffer and it may not reach
 585 
 586   // Allocate argument register save area
 587   if (frame::arg_reg_save_area_bytes != 0) {
 588     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 589   }
 590   __ mov(c_rarg0, rbx);
 591   __ mov(c_rarg1, rax);
 592   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 593 
 594   // De-allocate argument register save area
 595   if (frame::arg_reg_save_area_bytes != 0) {
 596     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 597   }
 598 
 599   __ vzeroupper();
 600   __ pop_CPU_state();
 601   // restore sp
 602   __ mov(rsp, r13);
 603   __ bind(L);
 604 }
 605 
 606 
 607 static void gen_c2i_adapter(MacroAssembler *masm,
 608                             int total_args_passed,
 609                             int comp_args_on_stack,
 610                             const BasicType *sig_bt,
 611                             const VMRegPair *regs,
 612                             Label& skip_fixup) {
 613   // Before we get into the guts of the C2I adapter, see if we should be here
 614   // at all.  We've come from compiled code and are attempting to jump to the
 615   // interpreter, which means the caller made a static call to get here
 616   // (vcalls always get a compiled target if there is one).  Check for a
 617   // compiled target.  If there is one, we need to patch the caller's call.
 618   patch_callers_callsite(masm);
 619 
 620   __ bind(skip_fixup);
 621 
 622   // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need. Plus one word because
  // we also account for the return address location, since
  // we store it first rather than holding it in rax across all the shuffling.
 626 
 627   int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
 628 
 629   // stack is aligned, keep it that way
 630   extraspace = align_up(extraspace, 2*wordSize);
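  // For example, with three interpreter argument slots and the 8-byte
  // stackElementSize of a 64-bit VM (an assumption of this note, not asserted
  // here), extraspace is 3*8 + 8 = 32 bytes, which is already 16-byte aligned.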
 631 
 632   // Get return address
 633   __ pop(rax);
 634 
 635   // set senderSP value
 636   __ mov(r13, rsp);
 637 
 638   __ subptr(rsp, extraspace);
 639 
 640   // Store the return address in the expected location
 641   __ movptr(Address(rsp, 0), rax);
 642 
 643   // Now write the args into the outgoing interpreter space
 644   for (int i = 0; i < total_args_passed; i++) {
 645     if (sig_bt[i] == T_VOID) {
 646       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 647       continue;
 648     }
 649 
 650     // offset to start parameters
 651     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 652     int next_off = st_off - Interpreter::stackElementSize;
 653 
 654     // Say 4 args:
 655     // i   st_off
 656     // 0   32 T_LONG
 657     // 1   24 T_VOID
 658     // 2   16 T_OBJECT
 659     // 3    8 T_BOOL
 660     // -    0 return address
 661     //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM, and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 666 
 667     VMReg r_1 = regs[i].first();
 668     VMReg r_2 = regs[i].second();
 669     if (!r_1->is_valid()) {
 670       assert(!r_2->is_valid(), "");
 671       continue;
 672     }
 673     if (r_1->is_stack()) {
      // memory to memory: use rax as a temporary
 675       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 676       if (!r_2->is_valid()) {
 677         // sign extend??
 678         __ movl(rax, Address(rsp, ld_off));
 679         __ movptr(Address(rsp, st_off), rax);
 680 
 681       } else {
 682 
 683         __ movq(rax, Address(rsp, ld_off));
 684 
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 686         // T_DOUBLE and T_LONG use two slots in the interpreter
 687         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 688           // ld_off == LSW, ld_off+wordSize == MSW
 689           // st_off == MSW, next_off == LSW
 690           __ movq(Address(rsp, next_off), rax);
 691 #ifdef ASSERT
 692           // Overwrite the unused slot with known junk
 693           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 694           __ movptr(Address(rsp, st_off), rax);
 695 #endif /* ASSERT */
 696         } else {
 697           __ movq(Address(rsp, st_off), rax);
 698         }
 699       }
 700     } else if (r_1->is_Register()) {
 701       Register r = r_1->as_Register();
 702       if (!r_2->is_valid()) {
        // must be only an int (or smaller), so move only 32 bits to the slot
        // why not sign extend??
 705         __ movl(Address(rsp, st_off), r);
 706       } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 708         // T_DOUBLE and T_LONG use two slots in the interpreter
 709         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 710           // long/double in gpr
 711 #ifdef ASSERT
 712           // Overwrite the unused slot with known junk
 713           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 714           __ movptr(Address(rsp, st_off), rax);
 715 #endif /* ASSERT */
 716           __ movq(Address(rsp, next_off), r);
 717         } else {
 718           __ movptr(Address(rsp, st_off), r);
 719         }
 720       }
 721     } else {
 722       assert(r_1->is_XMMRegister(), "");
 723       if (!r_2->is_valid()) {
        // only a float, so use just part of the slot
 725         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 726       } else {
 727 #ifdef ASSERT
 728         // Overwrite the unused slot with known junk
 729         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 730         __ movptr(Address(rsp, st_off), rax);
 731 #endif /* ASSERT */
 732         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 733       }
 734     }
 735   }
 736 
 737   // Schedule the branch target address early.
 738   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 739   __ jmp(rcx);
 740 }
 741 
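// Check that pc_reg lies strictly between code_start and code_end: jump to L_ok
// if it does, otherwise fall through (via L_fail) so the caller can emit the
// failure handling immediately after the call.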
 742 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 743                         address code_start, address code_end,
 744                         Label& L_ok) {
 745   Label L_fail;
 746   __ lea(temp_reg, ExternalAddress(code_start));
 747   __ cmpptr(pc_reg, temp_reg);
 748   __ jcc(Assembler::belowEqual, L_fail);
 749   __ lea(temp_reg, ExternalAddress(code_end));
 750   __ cmpptr(pc_reg, temp_reg);
 751   __ jcc(Assembler::below, L_ok);
 752   __ bind(L_fail);
 753 }
 754 
 755 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 756                                     int total_args_passed,
 757                                     int comp_args_on_stack,
 758                                     const BasicType *sig_bt,
 759                                     const VMRegPair *regs) {
 760 
 761   // Note: r13 contains the senderSP on entry. We must preserve it since
 762   // we may do a i2c -> c2i transition if we lose a race where compiled
 763   // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment we expect in all compiled code, and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
 769 
 770   // Adapters can be frameless because they do not require the caller
 771   // to perform additional cleanup work, such as correcting the stack pointer.
 772   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 773   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 774   // even if a callee has modified the stack pointer.
 775   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 776   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 777   // up via the senderSP register).
 778   // In other words, if *either* the caller or callee is interpreted, we can
 779   // get the stack pointer repaired after a call.
 780   // This is why c2i and i2c adapters cannot be indefinitely composed.
 781   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 782   // both caller and callee would be compiled methods, and neither would
 783   // clean up the stack pointer changes performed by the two adapters.
 784   // If this happens, control eventually transfers back to the compiled
 785   // caller, but with an uncorrected stack, causing delayed havoc.
 786 
 787   // Pick up the return address
 788   __ movptr(rax, Address(rsp, 0));
 789 
 790   if (VerifyAdapterCalls &&
 791       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
 792     // So, let's test for cascading c2i/i2c adapters right now.
 793     //  assert(Interpreter::contains($return_addr) ||
 794     //         StubRoutines::contains($return_addr),
 795     //         "i2c adapter must return to an interpreter frame");
 796     __ block_comment("verify_i2c { ");
 797     Label L_ok;
 798     if (Interpreter::code() != NULL)
 799       range_check(masm, rax, r11,
 800                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
 801                   L_ok);
 802     if (StubRoutines::code1() != NULL)
 803       range_check(masm, rax, r11,
 804                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
 805                   L_ok);
 806     if (StubRoutines::code2() != NULL)
 807       range_check(masm, rax, r11,
 808                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
 809                   L_ok);
 810     const char* msg = "i2c adapter must return to an interpreter frame";
 811     __ block_comment(msg);
 812     __ stop(msg);
 813     __ bind(L_ok);
    __ block_comment("} verify_i2c ");
 815   }
 816 
 817   // Must preserve original SP for loading incoming arguments because
 818   // we need to align the outgoing SP for compiled code.
 819   __ movptr(r11, rsp);
 820 
  // Cut-out for having no stack args.  Since up to 6 int/oop args are passed
 822   // in registers, we will occasionally have no stack args.
 823   int comp_words_on_stack = 0;
 824   if (comp_args_on_stack) {
 825     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
 826     // registers are below.  By subtracting stack0, we either get a negative
 827     // number (all values in registers) or the maximum stack slot accessed.
 828 
 829     // Convert 4-byte c2 stack slots to words.
 830     comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize units
 832     comp_words_on_stack = align_up(comp_words_on_stack, 2);
 833     __ subptr(rsp, comp_words_on_stack * wordSize);
 834   }
 835 
 836 
 837   // Ensure compiled code always sees stack at proper alignment
 838   __ andptr(rsp, -16);
 839 
  // Push the return address so the stack is misaligned exactly as the youngest
  // frame always sees it, i.e. as if it had just been pushed by a call instruction.
 842   __ push(rax);
 843 
 844   // Put saved SP in another register
 845   const Register saved_sp = rax;
 846   __ movptr(saved_sp, r11);
 847 
 848   // Will jump to the compiled code just as if compiled code was doing it.
 849   // Pre-load the register-jump target early, to schedule it better.
 850   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 851 
 852 #if INCLUDE_JVMCI
 853   if (EnableJVMCI) {
 854     // check if this call should be routed towards a specific entry point
 855     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 856     Label no_alternative_target;
 857     __ jcc(Assembler::equal, no_alternative_target);
 858     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 859     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 860     __ bind(no_alternative_target);
 861   }
 862 #endif // INCLUDE_JVMCI
 863 
 864   // Now generate the shuffle code.  Pick up all register args and move the
 865   // rest through the floating point stack top.
 866   for (int i = 0; i < total_args_passed; i++) {
 867     if (sig_bt[i] == T_VOID) {
 868       // Longs and doubles are passed in native word order, but misaligned
 869       // in the 32-bit build.
 870       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 871       continue;
 872     }
 873 
 874     // Pick up 0, 1 or 2 words from SP+offset.
 875 
 876     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 877             "scrambled load targets?");
 878     // Load in argument order going down.
 879     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 880     // Point to interpreter value (vs. tag)
 881     int next_off = ld_off - Interpreter::stackElementSize;
 882     //
 883     //
 884     //
 885     VMReg r_1 = regs[i].first();
 886     VMReg r_2 = regs[i].second();
 887     if (!r_1->is_valid()) {
 888       assert(!r_2->is_valid(), "");
 889       continue;
 890     }
 891     if (r_1->is_stack()) {
 892       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 893       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 894 
      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going through a c2i because of a miss a reasonable value of r13
      // will be generated.
 898       if (!r_2->is_valid()) {
 899         // sign extend???
 900         __ movl(r13, Address(saved_sp, ld_off));
 901         __ movptr(Address(rsp, st_off), r13);
 902       } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed at negative offsets so the LSW is at the LOW address.
 910 
 911         // ld_off is MSW so get LSW
 912         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 913                            next_off : ld_off;
 914         __ movq(r13, Address(saved_sp, offset));
 915         // st_off is LSW (i.e. reg.first())
 916         __ movq(Address(rsp, st_off), r13);
 917       }
 918     } else if (r_1->is_Register()) {  // Register argument
 919       Register r = r_1->as_Register();
 920       assert(r != rax, "must be different");
 921       if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
 926 
 927         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 928                            next_off : ld_off;
 929 
 930         // this can be a misaligned move
 931         __ movq(r, Address(saved_sp, offset));
 932       } else {
 933         // sign extend and use a full word?
 934         __ movl(r, Address(saved_sp, ld_off));
 935       }
 936     } else {
 937       if (!r_2->is_valid()) {
 938         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 939       } else {
 940         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 941       }
 942     }
 943   }
 944 
 945   // 6243940 We might end up in handle_wrong_method if
 946   // the callee is deoptimized as we race thru here. If that
 947   // happens we don't want to take a safepoint because the
 948   // caller frame will look interpreted and arguments are now
 949   // "compiled" so it is much better to make this transition
 950   // invisible to the stack walking code. Unfortunately if
 951   // we try and find the callee by normal means a safepoint
 952   // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
 954 
 955   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 956 
  // Put the Method* where a c2i would expect it, should we end up there.
  // Only needed because C2 resolve stubs return the Method* as a result in
  // rax.
 960   __ mov(rax, rbx);
 961   __ jmp(r11);
 962 }
 963 
 964 // ---------------------------------------------------------------
 965 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 966                                                             int total_args_passed,
 967                                                             int comp_args_on_stack,
 968                                                             const BasicType *sig_bt,
 969                                                             const VMRegPair *regs,
 970                                                             AdapterFingerPrint* fingerprint) {
 971   address i2c_entry = __ pc();
 972 
 973   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 974 
 975   // -------------------------------------------------------------------------
 976   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 977   // to the interpreter.  The args start out packed in the compiled layout.  They
 978   // need to be unpacked into the interpreter layout.  This will almost always
 979   // require some stack space.  We grow the current (compiled) stack, then repack
 980   // the args.  We  finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).
 983 
 984   address c2i_unverified_entry = __ pc();
 985   Label skip_fixup;
 986   Label ok;
 987 
 988   Register holder = rax;
 989   Register receiver = j_rarg0;
 990   Register temp = rbx;
 991 
 992   {
 993     __ load_klass(temp, receiver, rscratch1);
 994     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
 995     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
 996     __ jcc(Assembler::equal, ok);
 997     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 998 
 999     __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
    // the call site corrected.
1003     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
1004     __ jcc(Assembler::equal, skip_fixup);
1005     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1006   }
1007 
1008   address c2i_entry = __ pc();
1009 
1010   // Class initialization barrier for static methods
1011   address c2i_no_clinit_check_entry = NULL;
1012   if (VM_Version::supports_fast_class_init_checks()) {
1013     Label L_skip_barrier;
1014     Register method = rbx;
1015 
1016     { // Bypass the barrier for non-static methods
1017       Register flags  = rscratch1;
1018       __ movl(flags, Address(method, Method::access_flags_offset()));
1019       __ testl(flags, JVM_ACC_STATIC);
1020       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1021     }
1022 
1023     Register klass = rscratch1;
1024     __ load_method_holder(klass, method);
1025     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1026 
1027     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1028 
1029     __ bind(L_skip_barrier);
1030     c2i_no_clinit_check_entry = __ pc();
1031   }
1032 
1033   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1034   bs->c2i_entry_barrier(masm);
1035 
1036   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1037 
1038   __ flush();
1039   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1040 }
1041 
1042 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1043                                          VMRegPair *regs,
1044                                          VMRegPair *regs2,
1045                                          int total_args_passed) {
1046   assert(regs2 == NULL, "not needed on x86");
// We return the number of VMRegImpl stack slots we need to reserve for all
1048 // the arguments NOT counting out_preserve_stack_slots.
1049 
1050 // NOTE: These arrays will have to change when c1 is ported
1051 #ifdef _WIN64
1052     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1053       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1054     };
1055     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1056       c_farg0, c_farg1, c_farg2, c_farg3
1057     };
1058 #else
1059     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1060       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1061     };
1062     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1063       c_farg0, c_farg1, c_farg2, c_farg3,
1064       c_farg4, c_farg5, c_farg6, c_farg7
1065     };
1066 #endif // _WIN64
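    // Illustrative note, grounded in the tables above and the loop below: for a
    // call with three integer/pointer arguments, they land in c_rarg0..c_rarg2
    // on either ABI; on _WIN64 each register argument still reserves 2 stack
    // slots of home space, and a minimum of 8 slots (32 bytes) is enforced at
    // the end of this function.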
1067 
1068 
1069     uint int_args = 0;
1070     uint fp_args = 0;
1071     uint stk_args = 0; // inc by 2 each time
1072 
1073     for (int i = 0; i < total_args_passed; i++) {
1074       switch (sig_bt[i]) {
1075       case T_BOOLEAN:
1076       case T_CHAR:
1077       case T_BYTE:
1078       case T_SHORT:
1079       case T_INT:
1080         if (int_args < Argument::n_int_register_parameters_c) {
1081           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1082 #ifdef _WIN64
1083           fp_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1085           stk_args += 2;
1086 #endif
1087         } else {
1088           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1089           stk_args += 2;
1090         }
1091         break;
1092       case T_LONG:
1093         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1094         // fall through
1095       case T_OBJECT:
1096       case T_ARRAY:
1097       case T_ADDRESS:
1098       case T_METADATA:
1099         if (int_args < Argument::n_int_register_parameters_c) {
1100           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1101 #ifdef _WIN64
1102           fp_args++;
1103           stk_args += 2;
1104 #endif
1105         } else {
1106           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1107           stk_args += 2;
1108         }
1109         break;
1110       case T_FLOAT:
1111         if (fp_args < Argument::n_float_register_parameters_c) {
1112           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1113 #ifdef _WIN64
1114           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1116           stk_args += 2;
1117 #endif
1118         } else {
1119           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1120           stk_args += 2;
1121         }
1122         break;
1123       case T_DOUBLE:
1124         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1125         if (fp_args < Argument::n_float_register_parameters_c) {
1126           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1127 #ifdef _WIN64
1128           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1130           stk_args += 2;
1131 #endif
1132         } else {
1133           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1134           stk_args += 2;
1135         }
1136         break;
1137       case T_VOID: // Halves of longs and doubles
1138         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1139         regs[i].set_bad();
1140         break;
1141       default:
1142         ShouldNotReachHere();
1143         break;
1144       }
1145     }
1146 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down (32 bytes of home space, i.e.
  // 8 of the 4-byte VMRegImpl slots counted here).
1149   if (stk_args < 8) {
1150     stk_args = 8;
1151   }
1152 #endif // _WIN64
1153 
1154   return stk_args;
1155 }
1156 
1157 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1158                                              uint num_bits,
1159                                              uint total_args_passed) {
1160   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1161          "only certain vector sizes are supported for now");
1162 
1163   static const XMMRegister VEC_ArgReg[32] = {
1164      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1165      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1166     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1167     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1168   };
1169 
1170   uint stk_args = 0;
1171   uint fp_args = 0;
1172 
1173   for (uint i = 0; i < total_args_passed; i++) {
1174     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
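    // Each vector argument spans num_bits/32 4-byte VMReg slots; next_val below
    // is the index of the last slot (e.g. 7 for a 256-bit vector, an 8-slot span).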
1175     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1176     regs[i].set_pair(vmreg->next(next_val), vmreg);
1177   }
1178 
1179   return stk_args;
1180 }
1181 
1182 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1185   switch (ret_type) {
1186   case T_FLOAT:
1187     __ movflt(Address(rbp, -wordSize), xmm0);
1188     break;
1189   case T_DOUBLE:
1190     __ movdbl(Address(rbp, -wordSize), xmm0);
1191     break;
1192   case T_VOID:  break;
1193   default: {
1194     __ movptr(Address(rbp, -wordSize), rax);
1195     }
1196   }
1197 }
1198 
1199 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1202   switch (ret_type) {
1203   case T_FLOAT:
1204     __ movflt(xmm0, Address(rbp, -wordSize));
1205     break;
1206   case T_DOUBLE:
1207     __ movdbl(xmm0, Address(rbp, -wordSize));
1208     break;
1209   case T_VOID:  break;
1210   default: {
1211     __ movptr(rax, Address(rbp, -wordSize));
1212     }
1213   }
1214 }
1215 
1216 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1217     for ( int i = first_arg ; i < arg_count ; i++ ) {
1218       if (args[i].first()->is_Register()) {
1219         __ push(args[i].first()->as_Register());
1220       } else if (args[i].first()->is_XMMRegister()) {
1221         __ subptr(rsp, 2*wordSize);
1222         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1223       }
1224     }
1225 }
1226 
1227 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1228     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1229       if (args[i].first()->is_Register()) {
1230         __ pop(args[i].first()->as_Register());
1231       } else if (args[i].first()->is_XMMRegister()) {
1232         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1233         __ addptr(rsp, 2*wordSize);
1234       }
1235     }
1236 }
1237 
1238 // Different signatures may require very different orders for the move
1239 // to avoid clobbering other arguments.  There's no simple way to
1240 // order them safely.  Compute a safe order for issuing stores and
1241 // break any cycles in those stores.  This code is fairly general but
1242 // it's not necessary on the other platforms so we keep it in the
1243 // platform dependent code instead of moving it into a shared file.
1244 // (See bugs 7013347 & 7145024.)
1245 // Note that this code is specific to LP64.
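// A hypothetical two-register cycle illustrates the idea: the moves rdi -> rsi
// and rsi -> rdi each kill the other's source, so break_cycle() below redirects
// one of them through the temp register, giving an order like
// rdi -> tmp, rsi -> rdi, tmp -> rsi.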
1246 class ComputeMoveOrder: public StackObj {
1247   class MoveOperation: public ResourceObj {
1248     friend class ComputeMoveOrder;
1249    private:
1250     VMRegPair        _src;
1251     VMRegPair        _dst;
1252     int              _src_index;
1253     int              _dst_index;
1254     bool             _processed;
1255     MoveOperation*  _next;
1256     MoveOperation*  _prev;
1257 
1258     static int get_id(VMRegPair r) {
1259       return r.first()->value();
1260     }
1261 
1262    public:
1263     MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1264       _src(src)
1265     , _dst(dst)
1266     , _src_index(src_index)
1267     , _dst_index(dst_index)
1268     , _processed(false)
1269     , _next(NULL)
1270     , _prev(NULL) {
1271     }
1272 
1273     VMRegPair src() const              { return _src; }
1274     int src_id() const                 { return get_id(src()); }
1275     int src_index() const              { return _src_index; }
1276     VMRegPair dst() const              { return _dst; }
    void set_dst(int i, VMRegPair dst) { _dst_index = i; _dst = dst; }
1278     int dst_index() const              { return _dst_index; }
1279     int dst_id() const                 { return get_id(dst()); }
1280     MoveOperation* next() const       { return _next; }
1281     MoveOperation* prev() const       { return _prev; }
1282     void set_processed()               { _processed = true; }
1283     bool is_processed() const          { return _processed; }
1284 
1285     // insert
1286     void break_cycle(VMRegPair temp_register) {
1287       // create a new store following the last store
1288       // to move from the temp_register to the original
1289       MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1290 
1291       // break the cycle of links and insert new_store at the end
1292       // break the reverse link.
1293       MoveOperation* p = prev();
1294       assert(p->next() == this, "must be");
1295       _prev = NULL;
1296       p->_next = new_store;
1297       new_store->_prev = p;
1298 
      // change the original store to save its value in the temp.
1300       set_dst(-1, temp_register);
1301     }
1302 
1303     void link(GrowableArray<MoveOperation*>& killer) {
      // link this store in front of the store that it depends on
1305       MoveOperation* n = killer.at_grow(src_id(), NULL);
1306       if (n != NULL) {
1307         assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1308         _next = n;
1309         n->_prev = this;
1310       }
1311     }
1312   };
1313 
1314  private:
1315   GrowableArray<MoveOperation*> edges;
1316 
1317  public:
1318   ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1319                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1320     // Move operations where the dest is the stack can all be
1321     // scheduled first since they can't interfere with the other moves.
1322     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1323       if (in_sig_bt[i] == T_ARRAY) {
1324         c_arg--;
1325         if (out_regs[c_arg].first()->is_stack() &&
1326             out_regs[c_arg + 1].first()->is_stack()) {
1327           arg_order.push(i);
1328           arg_order.push(c_arg);
1329         } else {
1330           if (out_regs[c_arg].first()->is_stack() ||
1331               in_regs[i].first() == out_regs[c_arg].first()) {
1332             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1333           } else {
1334             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1335           }
1336         }
1337       } else if (in_sig_bt[i] == T_VOID) {
1338         arg_order.push(i);
1339         arg_order.push(c_arg);
1340       } else {
1341         if (out_regs[c_arg].first()->is_stack() ||
1342             in_regs[i].first() == out_regs[c_arg].first()) {
1343           arg_order.push(i);
1344           arg_order.push(c_arg);
1345         } else {
1346           add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1347         }
1348       }
1349     }
1350     // Break any cycles in the register moves and emit them in the
1351     // proper order.
1352     GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1353     for (int i = 0; i < stores->length(); i++) {
1354       arg_order.push(stores->at(i)->src_index());
1355       arg_order.push(stores->at(i)->dst_index());
1356     }
1357  }
1358 
1359   // Collect all the move operations
1360   void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1361     if (src.first() == dst.first()) return;
1362     edges.append(new MoveOperation(src_index, src, dst_index, dst));
1363   }
1364 
1365   // Walk the edges breaking cycles between moves.  The result list
1366   // can be walked in order to produce the proper set of loads
1367   GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1368     // Record which moves kill which values
1369     GrowableArray<MoveOperation*> killer;
1370     for (int i = 0; i < edges.length(); i++) {
1371       MoveOperation* s = edges.at(i);
1372       assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1373       killer.at_put_grow(s->dst_id(), s, NULL);
1374     }
1375     assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1376            "make sure temp isn't in the registers that are killed");
1377 
1378     // create links between loads and stores
1379     for (int i = 0; i < edges.length(); i++) {
1380       edges.at(i)->link(killer);
1381     }
1382 
1383     // at this point, all the move operations are chained together
1384     // in a doubly linked list.  Processing it backwards finds
1385     // the beginning of the chain, forwards finds the end.  If there's
1386     // a cycle it can be broken at any point,  so pick an edge and walk
1387     // backward until the list ends or we end where we started.
1388     GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1389     for (int e = 0; e < edges.length(); e++) {
1390       MoveOperation* s = edges.at(e);
1391       if (!s->is_processed()) {
1392         MoveOperation* start = s;
1393         // search for the beginning of the chain or cycle
1394         while (start->prev() != NULL && start->prev() != s) {
1395           start = start->prev();
1396         }
1397         if (start->prev() == s) {
1398           start->break_cycle(temp_register);
1399         }
1400         // walk the chain forward inserting to store list
1401         while (start != NULL) {
1402           stores->append(start);
1403           start->set_processed();
1404           start = start->next();
1405         }
1406       }
1407     }
1408     return stores;
1409   }
1410 };
1411 
1412 static void verify_oop_args(MacroAssembler* masm,
1413                             const methodHandle& method,
1414                             const BasicType* sig_bt,
1415                             const VMRegPair* regs) {
1416   Register temp_reg = rbx;  // not part of any compiled calling seq
1417   if (VerifyOops) {
1418     for (int i = 0; i < method->size_of_parameters(); i++) {
1419       if (is_reference_type(sig_bt[i])) {
1420         VMReg r = regs[i].first();
1421         assert(r->is_valid(), "bad oop arg");
1422         if (r->is_stack()) {
1423           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1424           __ verify_oop(temp_reg);
1425         } else {
1426           __ verify_oop(r->as_Register());
1427         }
1428       }
1429     }
1430   }
1431 }
1432 
1433 static void gen_special_dispatch(MacroAssembler* masm,
1434                                  const methodHandle& method,
1435                                  const BasicType* sig_bt,
1436                                  const VMRegPair* regs) {
1437   verify_oop_args(masm, method, sig_bt, regs);
1438   vmIntrinsics::ID iid = method->intrinsic_id();
1439 
1440   // Now write the args into the outgoing interpreter space
1441   bool     has_receiver   = false;
1442   Register receiver_reg   = noreg;
1443   int      member_arg_pos = -1;
1444   Register member_reg     = noreg;
1445   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1446   if (ref_kind != 0) {
1447     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1448     member_reg = rbx;  // known to be free at this point
1449     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1450   } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1451     has_receiver = true;
1452   } else {
1453     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1454   }
1455 
1456   if (member_reg != noreg) {
1457     // Load the member_arg into register, if necessary.
1458     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1459     VMReg r = regs[member_arg_pos].first();
1460     if (r->is_stack()) {
1461       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1462     } else {
1463       // no data motion is needed
1464       member_reg = r->as_Register();
1465     }
1466   }
1467 
1468   if (has_receiver) {
1469     // Make sure the receiver is loaded into a register.
1470     assert(method->size_of_parameters() > 0, "oob");
1471     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1472     VMReg r = regs[0].first();
1473     assert(r->is_valid(), "bad receiver arg");
1474     if (r->is_stack()) {
1475       // Porting note:  This assumes that compiled calling conventions always
1476       // pass the receiver oop in a register.  If this is not true on some
1477       // platform, pick a temp and load the receiver from stack.
1478       fatal("receiver always in a register");
1479       receiver_reg = j_rarg0;  // known to be free at this point
1480       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1481     } else {
1482       // no data motion is needed
1483       receiver_reg = r->as_Register();
1484     }
1485   }
1486 
1487   // Figure out which address we are really jumping to:
1488   MethodHandles::generate_method_handle_dispatch(masm, iid,
1489                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1490 }
1491 
1492 // ---------------------------------------------------------------------------
1493 // Generate a native wrapper for a given method.  The method takes arguments
1494 // in the Java compiled code convention, marshals them to the native
1495 // convention (handlizes oops, etc), transitions to native, makes the call,
1496 // returns to java state (possibly blocking), unhandlizes any result and
1497 // returns.
1498 //
1499 // Critical native functions are a shorthand for the use of
1500 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1501 // functions.  The wrapper is expected to unpack the arguments before
1502 // passing them to the callee. Critical native functions leave the state _in_Java,
1503 // since they cannot stop for GC.
1504 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1505 // block and the check for pending exceptions, since it's impossible for them
1506 // to be thrown.
1507 //
1508 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1509                                                 const methodHandle& method,
1510                                                 int compile_id,
1511                                                 BasicType* in_sig_bt,
1512                                                 VMRegPair* in_regs,
1513                                                 BasicType ret_type) {
1514   if (method->is_method_handle_intrinsic()) {
1515     vmIntrinsics::ID iid = method->intrinsic_id();
1516     intptr_t start = (intptr_t)__ pc();
1517     int vep_offset = ((intptr_t)__ pc()) - start;
1518     gen_special_dispatch(masm,
1519                          method,
1520                          in_sig_bt,
1521                          in_regs);
1522     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1523     __ flush();
1524     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1525     return nmethod::new_native_nmethod(method,
1526                                        compile_id,
1527                                        masm->code(),
1528                                        vep_offset,
1529                                        frame_complete,
1530                                        stack_slots / VMRegImpl::slots_per_word,
1531                                        in_ByteSize(-1),
1532                                        in_ByteSize(-1),
1533                                        (OopMapSet*)NULL);
1534   }
1535   address native_func = method->native_function();
1536   assert(native_func != NULL, "must have function");
1537 
1538   // An OopMap for lock (and class if static)
1539   OopMapSet *oop_maps = new OopMapSet();
1540   intptr_t start = (intptr_t)__ pc();
1541 
1542   // We have received a description of where all the java args are located
1543   // on entry to the wrapper. We need to convert these args to where
1544   // the jni function will expect them. To figure out where they go
1545   // we convert the java signature to a C signature by inserting
1546   // the hidden arguments as arg[0] and possibly arg[1] (static method)
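       //
       // For example (hypothetical method): a static native int foo(int, Object)
       // has the Java signature (int, Object) but is called from C as
       //   (JNIEnv*, jclass, jint, jobject)
       // which is why total_c_args below is total_in_args + 2 for a static
       // method and + 1 (just the JNIEnv*) otherwise.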
1547 
1548   const int total_in_args = method->size_of_parameters();
1549   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1550 
1551   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1552   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1553   BasicType* in_elem_bt = NULL;
1554 
1555   int argc = 0;
1556   out_sig_bt[argc++] = T_ADDRESS;
1557   if (method->is_static()) {
1558     out_sig_bt[argc++] = T_OBJECT;
1559   }
1560 
1561   for (int i = 0; i < total_in_args ; i++ ) {
1562     out_sig_bt[argc++] = in_sig_bt[i];
1563   }
1564 
1565   // Now figure out where the args must be stored and how much stack space
1566   // they require.
1567   int out_arg_slots;
1568   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1569 
1570   // Compute framesize for the wrapper.  We need to handlize all oops in
1571   // incoming registers
1572 
1573   // Calculate the total number of stack slots we will need.
1574 
1575   // First count the abi requirement plus all of the outgoing args
1576   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1577 
1578   // Now the space for the inbound oop handle area
1579   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1580 
1581   int oop_handle_offset = stack_slots;
1582   stack_slots += total_save_slots;
1583 
1584   // Now any space we need for handlizing a klass if static method
1585 
1586   int klass_slot_offset = 0;
1587   int klass_offset = -1;
1588   int lock_slot_offset = 0;
1589   bool is_static = false;
1590 
1591   if (method->is_static()) {
1592     klass_slot_offset = stack_slots;
1593     stack_slots += VMRegImpl::slots_per_word;
1594     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1595     is_static = true;
1596   }
1597 
1598   // Plus a lock if needed
1599 
1600   if (method->is_synchronized()) {
1601     lock_slot_offset = stack_slots;
1602     stack_slots += VMRegImpl::slots_per_word;
1603   }
1604 
1605   // Now a place (+2) to save return values or temp during shuffling
1606   // + 4 for return address (which we own) and saved rbp
1607   stack_slots += 6;
1608 
1609   // Ok The space we have allocated will look like:
1610   //
1611   //
1612   // FP-> |                     |
1613   //      |---------------------|
1614   //      | 2 slots for moves   |
1615   //      |---------------------|
1616   //      | lock box (if sync)  |
1617   //      |---------------------| <- lock_slot_offset
1618   //      | klass (if static)   |
1619   //      |---------------------| <- klass_slot_offset
1620   //      | oopHandle area      |
1621   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1622   //      | outbound memory     |
1623   //      | based arguments     |
1624   //      |                     |
1625   //      |---------------------|
1626   //      |                     |
1627   // SP-> | out_preserved_slots |
1628   //
1629   //
1630 
1631 
1632   // Now compute actual number of stack words we need rounding to make
1633   // stack properly aligned.
1634   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
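       // (With 16-byte stack alignment and 4-byte slots this rounds stack_slots
       // up to a multiple of 4, e.g. a hypothetical 57 slots becomes 60.)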
1635 
1636   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1637 
1638   // First thing make an ic check to see if we should even be here
1639 
1640   // We are free to use all registers as temps without saving them and
1641   // restoring them except rbp. rbp is the only callee save register
1642   // as far as the interpreter and the compiler(s) are concerned.
1643 
1644 
1645   const Register ic_reg = rax;
1646   const Register receiver = j_rarg0;
1647 
1648   Label hit;
1649   Label exception_pending;
1650 
1651   assert_different_registers(ic_reg, receiver, rscratch1);
1652   __ verify_oop(receiver);
1653   __ load_klass(rscratch1, receiver, rscratch2);
1654   __ cmpq(ic_reg, rscratch1);
1655   __ jcc(Assembler::equal, hit);
1656 
1657   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1658 
1659   // Verified entry point must be aligned
1660   __ align(8);
1661 
1662   __ bind(hit);
1663 
1664   int vep_offset = ((intptr_t)__ pc()) - start;
1665 
1666   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1667     Label L_skip_barrier;
1668     Register klass = r10;
1669     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1670     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1671 
1672     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1673 
1674     __ bind(L_skip_barrier);
1675   }
1676 
1677 #ifdef COMPILER1
1678   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1679   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1680     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1681   }
1682 #endif // COMPILER1
1683 
1684   // The instruction at the verified entry point must be 5 bytes or longer
1685   // because it can be patched on the fly by make_non_entrant. The stack bang
1686   // instruction fits that requirement.
1687 
1688   // Generate stack overflow check
1689   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1690 
1691   // Generate a new frame for the wrapper.
1692   __ enter();
1693   // -2 because return address is already present and so is saved rbp
1694   __ subptr(rsp, stack_size - 2*wordSize);
1695 
1696   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1697   bs->nmethod_entry_barrier(masm);
1698 
1699   // Frame is now completed as far as size and linkage.
1700   int frame_complete = ((intptr_t)__ pc()) - start;
1701 
1702     if (UseRTMLocking) {
1703       // Abort RTM transaction before calling JNI
1704       // because critical section will be large and will be
1705       // aborted anyway. Also nmethod could be deoptimized.
1706       __ xabort(0);
1707     }
1708 
1709 #ifdef ASSERT
1710     {
1711       Label L;
1712       __ mov(rax, rsp);
1713       __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1714       __ cmpptr(rax, rsp);
1715       __ jcc(Assembler::equal, L);
1716       __ stop("improperly aligned stack");
1717       __ bind(L);
1718     }
1719 #endif /* ASSERT */
1720 
1721 
1722   // We use r14 as the oop handle for the receiver/klass
1723   // It is callee save so it survives the call to native
1724 
1725   const Register oop_handle_reg = r14;
1726 
1727   //
1728   // We immediately shuffle the arguments so that any vm call we have to
1729   // make from here on out (sync slow path, jvmti, etc.) we will have
1730   // captured the oops from our caller and have a valid oopMap for
1731   // them.
1732 
1733   // -----------------
1734   // The Grand Shuffle
1735 
1736   // The Java calling convention is either equal (linux) or denser (win64) than the
1737   // c calling convention. However, because of the jni_env argument, the c calling
1738   // convention always has at least one more argument (two for static methods) than Java.
1739   // Therefore if we move the args from java -> c backwards then we will never have
1740   // a register->register conflict and we don't have to build a dependency graph
1741   // and figure out how to break any cycles.
1742   //
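       // Sketch of why backwards is safe (assuming the usual j_rarg/c_rarg
       // aliasing where j_rarg0 is c_rarg1, j_rarg1 is c_rarg2, and so on):
       // for a static method, register-passed Java arg i sits in j_rarg(i) and
       // must end up in c_rarg(i+2) == j_rarg(i+1), which currently holds Java
       // arg i+1.  Walking from the last argument down reads that register
       // before it is overwritten.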
1743 
1744   // Record esp-based slot for receiver on stack for non-static methods
1745   int receiver_offset = -1;
1746 
1747   // This is a trick. We double the stack slots so we can claim
1748   // the oops in the caller's frame. Since we are sure to have
1749   // more args than the caller, doubling is enough to make
1750   // sure we can capture all the incoming oop args from the
1751   // caller.
1752   //
1753   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1754 
1755   // Mark location of rbp (someday)
1756   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1757 
1758   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1759   // All inbound args are referenced based on rbp and all outbound args via rsp.
1760 
1761 
1762 #ifdef ASSERT
1763   bool reg_destroyed[RegisterImpl::number_of_registers];
1764   bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1765   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1766     reg_destroyed[r] = false;
1767   }
1768   for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1769     freg_destroyed[f] = false;
1770   }
1771 
1772 #endif /* ASSERT */
1773 
1774   // For JNI natives the incoming and outgoing registers are offset upwards.
1775   GrowableArray<int> arg_order(2 * total_in_args);
1776 
1777   VMRegPair tmp_vmreg;
1778   tmp_vmreg.set2(rbx->as_VMReg());
1779 
1780   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1781     arg_order.push(i);
1782     arg_order.push(c_arg);
1783   }
1784 
1785   int temploc = -1;
1786   for (int ai = 0; ai < arg_order.length(); ai += 2) {
1787     int i = arg_order.at(ai);
1788     int c_arg = arg_order.at(ai + 1);
1789     __ block_comment(err_msg("move %d -> %d", i, c_arg));
1790 #ifdef ASSERT
1791     if (in_regs[i].first()->is_Register()) {
1792       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1793     } else if (in_regs[i].first()->is_XMMRegister()) {
1794       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1795     }
1796     if (out_regs[c_arg].first()->is_Register()) {
1797       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1798     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1799       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1800     }
1801 #endif /* ASSERT */
1802     switch (in_sig_bt[i]) {
1803       case T_ARRAY:
1804       case T_OBJECT:
1805         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1806                     ((i == 0) && (!is_static)),
1807                     &receiver_offset);
1808         break;
1809       case T_VOID:
1810         break;
1811 
1812       case T_FLOAT:
1813         __ float_move(in_regs[i], out_regs[c_arg]);
1814         break;
1815 
1816       case T_DOUBLE:
1817         assert( i + 1 < total_in_args &&
1818                 in_sig_bt[i + 1] == T_VOID &&
1819                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1820         __ double_move(in_regs[i], out_regs[c_arg]);
1821         break;
1822 
1823       case T_LONG :
1824         __ long_move(in_regs[i], out_regs[c_arg]);
1825         break;
1826 
1827       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1828 
1829       default:
1830         __ move32_64(in_regs[i], out_regs[c_arg]);
1831     }
1832   }
1833 
1834   int c_arg;
1835 
1836   // Pre-load a static method's oop into r14.  Used both by locking code and
1837   // the normal JNI call code.
1838   // point c_arg at the first arg that is already loaded in case we
1839   // need to spill before we call out
1840   c_arg = total_c_args - total_in_args;
1841 
1842   if (method->is_static()) {
1843 
1844     //  load oop into a register
1845     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1846 
1847     // Now handlize the static class mirror; it's known to be non-null.
1848     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1849     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1850 
1851     // Now get the handle
1852     __ lea(oop_handle_reg, Address(rsp, klass_offset));
1853     // store the klass handle as second argument
1854     __ movptr(c_rarg1, oop_handle_reg);
1855     // and protect the arg if we must spill
1856     c_arg--;
1857   }
1858 
1859   // Change state to native (we save the return address in the thread, since it might not
1860   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
1861   // points into the right code segment. It does not have to be the correct return pc.
1862   // We use the same pc/oopMap repeatedly when we call out
1863 
1864   intptr_t the_pc = (intptr_t) __ pc();
1865   oop_maps->add_gc_map(the_pc - start, map);
1866 
1867   __ set_last_Java_frame(rsp, noreg, (address)the_pc);
1868 
1869 
1870   // We have all of the arguments set up at this point. We must not touch any of the
1871   // argument registers from here on (if we saved/restored them there would be no oopMap covering the oops they may hold).
1872 
1873   {
1874     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
1875     // protect the args we've loaded
1876     save_args(masm, total_c_args, c_arg, out_regs);
1877     __ mov_metadata(c_rarg1, method());
1878     __ call_VM_leaf(
1879       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
1880       r15_thread, c_rarg1);
1881     restore_args(masm, total_c_args, c_arg, out_regs);
1882   }
1883 
1884   // RedefineClasses() tracing support for obsolete method entry
1885   if (log_is_enabled(Trace, redefine, class, obsolete)) {
1886     // protect the args we've loaded
1887     save_args(masm, total_c_args, c_arg, out_regs);
1888     __ mov_metadata(c_rarg1, method());
1889     __ call_VM_leaf(
1890       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
1891       r15_thread, c_rarg1);
1892     restore_args(masm, total_c_args, c_arg, out_regs);
1893   }
1894 
1895   // Lock a synchronized method
1896 
1897   // Register definitions used by locking and unlocking
1898 
1899   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
1900   const Register obj_reg  = rbx;  // Will contain the oop
1901   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
1902   const Register old_hdr  = r13;  // value of old header at unlock time
1903 
1904   Label slow_path_lock;
1905   Label lock_done;
1906 
1907   if (method->is_synchronized()) {
1908 
1909     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
1910 
1911     // Get the handle (the 2nd argument)
1912     __ mov(oop_handle_reg, c_rarg1);
1913 
1914     // Get address of the box
1915 
1916     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
1917 
1918     // Load the oop from the handle
1919     __ movptr(obj_reg, Address(oop_handle_reg, 0));
1920 
1921     if (!UseHeavyMonitors) {
1922       // Load immediate 1 into swap_reg %rax
1923       __ movl(swap_reg, 1);
1924 
1925       // Load (object->mark() | 1) into swap_reg %rax
1926       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1927 
1928       // Save (object->mark() | 1) into BasicLock's displaced header
1929       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
1930 
1931       // src -> dest iff dest == rax else rax <- dest
1932       __ lock();
1933       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1934       __ jcc(Assembler::equal, lock_done);
1935 
1936       // Hmm should this move to the slow path code area???
1937 
1938       // Test if the oopMark is an obvious stack pointer, i.e.,
1939       //  1) (mark & 3) == 0, and
1940       //  2) rsp <= mark < rsp + os::pagesize()
1941       // These 3 tests can be done by evaluating the following
1942       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
1943       // assuming both stack pointer and pagesize have their
1944       // least significant 2 bits clear.
1945       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
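           //
           // Worked example (assuming a 4K page): 3 - 4096 == -4093, whose two's
           // complement form keeps the low two bits and every bit from the
           // page-size bit upward.  The andptr below therefore yields zero
           // exactly when mark's low two bits are clear and 0 <= mark - rsp <
           // 4096, i.e. the mark points into our own stack just above rsp (the
           // recursive stack-lock case).  Any non-zero result takes the slow
           // path.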
1946 
1947       __ subptr(swap_reg, rsp);
1948       __ andptr(swap_reg, 3 - os::vm_page_size());
1949 
1950       // Save the test result, for recursive case, the result is zero
1951       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
1952       __ jcc(Assembler::notEqual, slow_path_lock);
1953     } else {
1954       __ jmp(slow_path_lock);
1955     }
1956 
1957     // Slow path will re-enter here
1958 
1959     __ bind(lock_done);
1960   }
1961 
1962   // Finally just about ready to make the JNI call
1963 
1964   // get JNIEnv* which is first argument to native
1965   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
1966 
1967   // Now set thread in native
1968   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
1969 
1970   __ call(RuntimeAddress(native_func));
1971 
1972   // Verify or restore cpu control state after JNI call
1973   __ restore_cpu_control_state_after_jni();
1974 
1975   // Unpack native results.
1976   switch (ret_type) {
1977   case T_BOOLEAN: __ c2bool(rax);            break;
1978   case T_CHAR   : __ movzwl(rax, rax);      break;
1979   case T_BYTE   : __ sign_extend_byte (rax); break;
1980   case T_SHORT  : __ sign_extend_short(rax); break;
1981   case T_INT    : /* nothing to do */        break;
1982   case T_DOUBLE :
1983   case T_FLOAT  :
1984     // Result is in xmm0 we'll save as needed
1985     break;
1986   case T_ARRAY:                 // Really a handle
1987   case T_OBJECT:                // Really a handle
1988       break; // can't de-handlize until after safepoint check
1989   case T_VOID: break;
1990   case T_LONG: break;
1991   default       : ShouldNotReachHere();
1992   }
1993 
1994   Label after_transition;
1995 
1996   // Switch thread to "native transition" state before reading the synchronization state.
1997   // This additional state is necessary because reading and testing the synchronization
1998   // state is not atomic w.r.t. GC, as this scenario demonstrates:
1999   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2000   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2001   //     Thread A is resumed to finish this native method, but doesn't block here since it
2002   //     didn't see any synchronization in progress, and escapes.
2003   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2004 
2005   // Force this write out before the read below
2006   __ membar(Assembler::Membar_mask_bits(
2007               Assembler::LoadLoad | Assembler::LoadStore |
2008               Assembler::StoreLoad | Assembler::StoreStore));
2009 
2010   // check for safepoint operation in progress and/or pending suspend requests
2011   {
2012     Label Continue;
2013     Label slow_path;
2014 
2015     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2016 
2017     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2018     __ jcc(Assembler::equal, Continue);
2019     __ bind(slow_path);
2020 
2021     // Don't use call_VM as it will see a possible pending exception and forward it
2022     // and never return here preventing us from clearing _last_native_pc down below.
2023     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2024     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2025     // by hand.
2026     //
2027     __ vzeroupper();
2028     save_native_result(masm, ret_type, stack_slots);
2029     __ mov(c_rarg0, r15_thread);
2030     __ mov(r12, rsp); // remember sp
2031     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2032     __ andptr(rsp, -16); // align stack as required by ABI
2033     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2034     __ mov(rsp, r12); // restore sp
2035     __ reinit_heapbase();
2036     // Restore any method result value
2037     restore_native_result(masm, ret_type, stack_slots);
2038     __ bind(Continue);
2039   }
2040 
2041   // change thread state
2042   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2043   __ bind(after_transition);
2044 
2045   Label reguard;
2046   Label reguard_done;
2047   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2048   __ jcc(Assembler::equal, reguard);
2049   __ bind(reguard_done);
2050 
2051   // native result if any is live
2052 
2053   // Unlock
2054   Label unlock_done;
2055   Label slow_path_unlock;
2056   if (method->is_synchronized()) {
2057 
2058     // Get locked oop from the handle we passed to jni
2059     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2060 
2061     Label done;
2062 
2063     if (!UseHeavyMonitors) {
2064       // Simple recursive lock?
2065       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2066       __ jcc(Assembler::equal, done);
2067     }
2068 
2069     // Must save rax if it is live now because cmpxchg must use it
2070     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2071       save_native_result(masm, ret_type, stack_slots);
2072     }
2073 
2074 
2075     if (!UseHeavyMonitors) {
2076       // get address of the stack lock
2077       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2078       //  get old displaced header
2079       __ movptr(old_hdr, Address(rax, 0));
2080 
2081       // Atomic swap old header if oop still contains the stack lock
2082       __ lock();
2083       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2084       __ jcc(Assembler::notEqual, slow_path_unlock);
2085     } else {
2086       __ jmp(slow_path_unlock);
2087     }
2088 
2089     // slow path re-enters here
2090     __ bind(unlock_done);
2091     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2092       restore_native_result(masm, ret_type, stack_slots);
2093     }
2094 
2095     __ bind(done);
2096 
2097   }
2098   {
2099     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2100     save_native_result(masm, ret_type, stack_slots);
2101     __ mov_metadata(c_rarg1, method());
2102     __ call_VM_leaf(
2103          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2104          r15_thread, c_rarg1);
2105     restore_native_result(masm, ret_type, stack_slots);
2106   }
2107 
2108   __ reset_last_Java_frame(false);
2109 
2110   // Unbox oop result, e.g. JNIHandles::resolve value.
2111   if (is_reference_type(ret_type)) {
2112     __ resolve_jobject(rax /* value */,
2113                        r15_thread /* thread */,
2114                        rcx /* tmp */);
2115   }
2116 
2117   if (CheckJNICalls) {
2118     // clear_pending_jni_exception_check
2119     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2120   }
2121 
2122   // reset handle block
2123   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2124   __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2125 
2126   // pop our frame
2127 
2128   __ leave();
2129 
2130   // Any exception pending?
2131   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2132   __ jcc(Assembler::notEqual, exception_pending);
2133 
2134   // Return
2135 
2136   __ ret(0);
2137 
2138   // Unexpected paths are out of line and go here
2139 
2140   // forward the exception
2141   __ bind(exception_pending);
2142 
2143   // and forward the exception
2144   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2145 
2146   // Slow path locking & unlocking
2147   if (method->is_synchronized()) {
2148 
2149     // BEGIN Slow path lock
2150     __ bind(slow_path_lock);
2151 
2152     // We have last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM.
2153     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2154 
2155     // protect the args we've loaded
2156     save_args(masm, total_c_args, c_arg, out_regs);
2157 
2158     __ mov(c_rarg0, obj_reg);
2159     __ mov(c_rarg1, lock_reg);
2160     __ mov(c_rarg2, r15_thread);
2161 
2162     // Not a leaf but we have last_Java_frame setup as we want
2163     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2164     restore_args(masm, total_c_args, c_arg, out_regs);
2165 
2166 #ifdef ASSERT
2167     { Label L;
2168     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2169     __ jcc(Assembler::equal, L);
2170     __ stop("no pending exception allowed on exit from monitorenter");
2171     __ bind(L);
2172     }
2173 #endif
2174     __ jmp(lock_done);
2175 
2176     // END Slow path lock
2177 
2178     // BEGIN Slow path unlock
2179     __ bind(slow_path_unlock);
2180 
2181     // If we haven't already saved the native result we must save it now as xmm registers
2182     // are still exposed.
2183     __ vzeroupper();
2184     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2185       save_native_result(masm, ret_type, stack_slots);
2186     }
2187 
2188     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2189 
2190     __ mov(c_rarg0, obj_reg);
2191     __ mov(c_rarg2, r15_thread);
2192     __ mov(r12, rsp); // remember sp
2193     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2194     __ andptr(rsp, -16); // align stack as required by ABI
2195 
2196     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2197     // NOTE that obj_reg == rbx currently
2198     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2199     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2200 
2201     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2202     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2203     __ mov(rsp, r12); // restore sp
2204     __ reinit_heapbase();
2205 #ifdef ASSERT
2206     {
2207       Label L;
2208       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2209       __ jcc(Assembler::equal, L);
2210       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2211       __ bind(L);
2212     }
2213 #endif /* ASSERT */
2214 
2215     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2216 
2217     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2218       restore_native_result(masm, ret_type, stack_slots);
2219     }
2220     __ jmp(unlock_done);
2221 
2222     // END Slow path unlock
2223 
2224   } // synchronized
2225 
2226   // SLOW PATH Reguard the stack if needed
2227 
2228   __ bind(reguard);
2229   __ vzeroupper();
2230   save_native_result(masm, ret_type, stack_slots);
2231   __ mov(r12, rsp); // remember sp
2232   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2233   __ andptr(rsp, -16); // align stack as required by ABI
2234   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2235   __ mov(rsp, r12); // restore sp
2236   __ reinit_heapbase();
2237   restore_native_result(masm, ret_type, stack_slots);
2238   // and continue
2239   __ jmp(reguard_done);
2240 
2241 
2242 
2243   __ flush();
2244 
2245   nmethod *nm = nmethod::new_native_nmethod(method,
2246                                             compile_id,
2247                                             masm->code(),
2248                                             vep_offset,
2249                                             frame_complete,
2250                                             stack_slots / VMRegImpl::slots_per_word,
2251                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2252                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2253                                             oop_maps);
2254 
2255   return nm;
2256 }
2257 
2258 // this function returns the adjustment size (in number of words) to a c2i adapter
2259 // activation for use during deoptimization
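     // (For example, with hypothetical counts, a callee with 2 parameters and
     // 5 locals needs (5 - 2) * Interpreter::stackElementWords extra words.)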
2260 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2261   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2262 }
2263 
2264 
2265 uint SharedRuntime::out_preserve_stack_slots() {
2266   return 0;
2267 }
2268 
2269 
2270 // Number of stack slots between incoming argument block and the start of
2271 // a new frame.  The PROLOG must add this many slots to the stack.  The
2272 // EPILOG must remove this many slots.  amd64 needs two slots for
2273 // return address.
2274 uint SharedRuntime::in_preserve_stack_slots() {
2275   return 4 + 2 * VerifyStackAtCalls;
2276 }
2277 
2278 //------------------------------generate_deopt_blob----------------------------
2279 void SharedRuntime::generate_deopt_blob() {
2280   // Allocate space for the code
2281   ResourceMark rm;
2282   // Setup code generation tools
2283   int pad = 0;
2284   if (UseAVX > 2) {
2285     pad += 1024;
2286   }
2287 #if INCLUDE_JVMCI
2288   if (EnableJVMCI) {
2289     pad += 512; // Increase the buffer size when compiling for JVMCI
2290   }
2291 #endif
2292   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2293   MacroAssembler* masm = new MacroAssembler(&buffer);
2294   int frame_size_in_words;
2295   OopMap* map = NULL;
2296   OopMapSet *oop_maps = new OopMapSet();
2297 
2298   // -------------
2299   // This code enters when returning to a de-optimized nmethod.  A return
2300   // address has been pushed on the stack, and return values are in
2301   // registers.
2302   // If we are doing a normal deopt then we were called from the patched
2303   // nmethod from the point we returned to the nmethod. So the return
2304   // address on the stack is wrong by NativeCall::instruction_size
2305   // We will adjust the value so it looks like we have the original return
2306   // address on the stack (like when we eagerly deoptimized).
2307   // In the case of an exception pending when deoptimizing, we enter
2308   // with a return address on the stack that points after the call we patched
2309   // into the exception handler. We have the following register state from,
2310   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2311   //    rax: exception oop
2312   //    rbx: exception handler
2313   //    rdx: throwing pc
2314   // So in this case we simply jam rdx into the useless return address and
2315   // the stack looks just like we want.
2316   //
2317   // At this point we need to de-opt.  We save the argument return
2318   // registers.  We call the first C routine, fetch_unroll_info().  This
2319   // routine captures the return values and returns a structure which
2320   // describes the current frame size and the sizes of all replacement frames.
2321   // The current frame is compiled code and may contain many inlined
2322   // functions, each with their own JVM state.  We pop the current frame, then
2323   // push all the new frames.  Then we call the C routine unpack_frames() to
2324   // populate these frames.  Finally unpack_frames() returns us the new target
2325   // address.  Notice that callee-save registers are BLOWN here; they have
2326   // already been captured in the vframeArray at the time the return PC was
2327   // patched.
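       //
       // Roughly, the stack is transformed as follows (youngest frame first,
       // sketch only):
       //   before:  [deoptimized compiled frame, N inlined JVM states] [caller]
       //   after:   [interpreter frame 1] ... [interpreter frame N]    [caller]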
2328   address start = __ pc();
2329   Label cont;
2330 
2331   // Prolog for non exception case!
2332 
2333   // Save everything in sight.
2334   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2335 
2336   // Normal deoptimization.  Save exec mode for unpack_frames.
2337   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2338   __ jmp(cont);
2339 
2340   int reexecute_offset = __ pc() - start;
2341 #if INCLUDE_JVMCI && !defined(COMPILER1)
2342   if (EnableJVMCI && UseJVMCICompiler) {
2343     // JVMCI does not use this kind of deoptimization
2344     __ should_not_reach_here();
2345   }
2346 #endif
2347 
2348   // Reexecute case
2349   // return address is the pc that describes what bci to re-execute at
2350 
2351   // No need to update map as each call to save_live_registers will produce identical oopmap
2352   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2353 
2354   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2355   __ jmp(cont);
2356 
2357 #if INCLUDE_JVMCI
2358   Label after_fetch_unroll_info_call;
2359   int implicit_exception_uncommon_trap_offset = 0;
2360   int uncommon_trap_offset = 0;
2361 
2362   if (EnableJVMCI) {
2363     implicit_exception_uncommon_trap_offset = __ pc() - start;
2364 
2365     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2366     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2367 
2368     uncommon_trap_offset = __ pc() - start;
2369 
2370     // Save everything in sight.
2371     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2372     // fetch_unroll_info needs to call last_java_frame()
2373     __ set_last_Java_frame(noreg, noreg, NULL);
2374 
2375     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2376     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2377 
2378     __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2379     __ mov(c_rarg0, r15_thread);
2380     __ movl(c_rarg2, r14); // exec mode
2381     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2382     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2383 
2384     __ reset_last_Java_frame(false);
2385 
2386     __ jmp(after_fetch_unroll_info_call);
2387   } // EnableJVMCI
2388 #endif // INCLUDE_JVMCI
2389 
2390   int exception_offset = __ pc() - start;
2391 
2392   // Prolog for exception case
2393 
2394   // all registers are dead at this entry point, except for rax and
2395   // rdx, which contain the exception oop and exception pc
2396   // respectively.  Set them in TLS and fall thru to the
2397   // unpack_with_exception_in_tls entry point.
2398 
2399   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2400   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2401 
2402   int exception_in_tls_offset = __ pc() - start;
2403 
2404   // new implementation because exception oop is now passed in JavaThread
2405 
2406   // Prolog for exception case
2407   // All registers must be preserved because they might be used by LinearScan
2408   // Exception oop and throwing PC are passed in JavaThread
2409   // tos: stack at point of call to method that threw the exception (i.e. only
2410   // args are on the stack, no return address)
2411 
2412   // make room on stack for the return address
2413   // It will be patched later with the throwing pc. The correct value is not
2414   // available now because loading it from memory would destroy registers.
2415   __ push(0);
2416 
2417   // Save everything in sight.
2418   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2419 
2420   // Now it is safe to overwrite any register
2421 
2422   // Deopt during an exception.  Save exec mode for unpack_frames.
2423   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2424 
2425   // load throwing pc from JavaThread and patch it as the return address
2426   // of the current frame. Then clear the field in JavaThread
2427 
2428   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2429   __ movptr(Address(rbp, wordSize), rdx);
2430   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2431 
2432 #ifdef ASSERT
2433   // verify that there is really an exception oop in JavaThread
2434   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2435   __ verify_oop(rax);
2436 
2437   // verify that there is no pending exception
2438   Label no_pending_exception;
2439   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2440   __ testptr(rax, rax);
2441   __ jcc(Assembler::zero, no_pending_exception);
2442   __ stop("must not have pending exception here");
2443   __ bind(no_pending_exception);
2444 #endif
2445 
2446   __ bind(cont);
2447 
2448   // Call C code.  Need thread and this frame, but NOT official VM entry
2449   // crud.  We cannot block on this call, no GC can happen.
2450   //
2451   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2452 
2453   // fetch_unroll_info needs to call last_java_frame().
2454 
2455   __ set_last_Java_frame(noreg, noreg, NULL);
2456 #ifdef ASSERT
2457   { Label L;
2458     __ cmpptr(Address(r15_thread,
2459                     JavaThread::last_Java_fp_offset()),
2460             (int32_t)0);
2461     __ jcc(Assembler::equal, L);
2462     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2463     __ bind(L);
2464   }
2465 #endif // ASSERT
2466   __ mov(c_rarg0, r15_thread);
2467   __ movl(c_rarg1, r14); // exec_mode
2468   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2469 
2470   // Need to have an oopmap that tells fetch_unroll_info where to
2471   // find any register it might need.
2472   oop_maps->add_gc_map(__ pc() - start, map);
2473 
2474   __ reset_last_Java_frame(false);
2475 
2476 #if INCLUDE_JVMCI
2477   if (EnableJVMCI) {
2478     __ bind(after_fetch_unroll_info_call);
2479   }
2480 #endif
2481 
2482   // Load UnrollBlock* into rdi
2483   __ mov(rdi, rax);
2484 
2485   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2486   Label noException;
2487   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2488   __ jcc(Assembler::notEqual, noException);
2489   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2490   // QQQ this is useless it was NULL above
2491   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2492   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2493   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2494 
2495   __ verify_oop(rax);
2496 
2497   // Overwrite the result registers with the exception results.
2498   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2499   // I think this is useless
2500   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2501 
2502   __ bind(noException);
2503 
2504   // Only register save data is on the stack.
2505   // Now restore the result registers.  Everything else is either dead
2506   // or captured in the vframeArray.
2507   RegisterSaver::restore_result_registers(masm);
2508 
2509   // All of the register save area has been popped off the stack. Only the
2510   // return address remains.
2511 
2512   // Pop all the frames we must move/replace.
2513   //
2514   // Frame picture (youngest to oldest)
2515   // 1: self-frame (no frame link)
2516   // 2: deopting frame  (no frame link)
2517   // 3: caller of deopting frame (could be compiled/interpreted).
2518   //
2519   // Note: by leaving the return address of self-frame on the stack
2520   // and using the size of frame 2 to adjust the stack
2521   // when we are done, the return address to frame 3 will still be on the stack.
2522 
2523   // Pop deoptimized frame
2524   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2525   __ addptr(rsp, rcx);
2526 
2527   // rsp should be pointing at the return address to the caller (3)
2528 
2529   // Pick up the initial fp we should save
2530   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2531   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2532 
2533 #ifdef ASSERT
2534   // Compilers generate code that bangs the stack by as much as the
2535   // interpreter would need. So this stack banging should never
2536   // trigger a fault. Verify that it does not on non-product builds.
2537   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2538   __ bang_stack_size(rbx, rcx);
2539 #endif
2540 
2541   // Load address of array of frame pcs into rcx
2542   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2543 
2544   // Trash the old pc
2545   __ addptr(rsp, wordSize);
2546 
2547   // Load address of array of frame sizes into rsi
2548   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2549 
2550   // Load counter into rdx
2551   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2552 
2553   // Now adjust the caller's stack to make up for the extra locals,
2554   // but record the original sp so that we can save it in the skeletal interpreter
2555   // frame; the stack walking of interpreter_sender will then get the unextended sp
2556   // value and not the "real" sp value.
2557 
2558   const Register sender_sp = r8;
2559 
2560   __ mov(sender_sp, rsp);
2561   __ movl(rbx, Address(rdi,
2562                        Deoptimization::UnrollBlock::
2563                        caller_adjustment_offset_in_bytes()));
2564   __ subptr(rsp, rbx);
2565 
2566   // Push interpreter frames in a loop
2567   Label loop;
2568   __ bind(loop);
2569   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2570   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2571   __ pushptr(Address(rcx, 0));          // Save return address
2572   __ enter();                           // Save old & set new ebp
2573   __ subptr(rsp, rbx);                  // Prolog
2574   // This value is corrected by layout_activation_impl
2575   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2576   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2577   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2578   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2579   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2580   __ decrementl(rdx);                   // Decrement counter
2581   __ jcc(Assembler::notZero, loop);
2582   __ pushptr(Address(rcx, 0));          // Save final return address
2583 
2584   // Re-push self-frame
2585   __ enter();                           // Save old & set new ebp
2586 
2587   // Allocate a full sized register save area.
2588   // Return address and rbp are in place, so we allocate two less words.
2589   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2590 
2591   // Restore frame locals after moving the frame
2592   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2593   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2594 
2595   // Call C code.  Need thread but NOT official VM entry
2596   // crud.  We cannot block on this call, no GC can happen.  Call should
2597   // restore return values to their stack-slots with the new SP.
2598   //
2599   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2600 
2601   // Use rbp because the frames look interpreted now
2602   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2603   // Don't need the precise return PC here, just precise enough to point into this code blob.
2604   address the_pc = __ pc();
2605   __ set_last_Java_frame(noreg, rbp, the_pc);
2606 
2607   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2608   __ mov(c_rarg0, r15_thread);
2609   __ movl(c_rarg1, r14); // second arg: exec_mode
2610   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2611   // Revert SP alignment after call since we're going to do some SP relative addressing below
2612   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2613 
2614   // Set an oopmap for the call site
2615   // Use the same PC we used for the last java frame
2616   oop_maps->add_gc_map(the_pc - start,
2617                        new OopMap( frame_size_in_words, 0 ));
2618 
2619   // Clear fp AND pc
2620   __ reset_last_Java_frame(true);
2621 
2622   // Collect return values
2623   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2624   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2625   // I think this is useless (throwing pc?)
2626   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2627 
2628   // Pop self-frame.
2629   __ leave();                           // Epilog
2630 
2631   // Jump to interpreter
2632   __ ret(0);
2633 
2634   // Make sure all code is generated
2635   masm->flush();
2636 
2637   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2638   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2639 #if INCLUDE_JVMCI
2640   if (EnableJVMCI) {
2641     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2642     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2643   }
2644 #endif
2645 }
2646 
2647 #ifdef COMPILER2
2648 //------------------------------generate_uncommon_trap_blob--------------------
2649 void SharedRuntime::generate_uncommon_trap_blob() {
2650   // Allocate space for the code
2651   ResourceMark rm;
2652   // Setup code generation tools
2653   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2654   MacroAssembler* masm = new MacroAssembler(&buffer);
2655 
2656   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2657 
2658   address start = __ pc();
2659 
2660   if (UseRTMLocking) {
2661     // Abort RTM transaction before possible nmethod deoptimization.
2662     __ xabort(0);
2663   }
2664 
2665   // Push self-frame.  We get here with a return address on the
2666   // stack, so rsp is 8-byte aligned until we allocate our frame.
2667   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2668 
2669   // No callee saved registers. rbp is assumed implicitly saved
2670   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2671 
2672   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2673   // runtime expects it.
2674   __ movl(c_rarg1, j_rarg0);
2675 
2676   __ set_last_Java_frame(noreg, noreg, NULL);
2677 
2678   // Call C code.  Need thread but NOT official VM entry
2679   // crud.  We cannot block on this call, no GC can happen.  Call should
2680   // capture callee-saved registers as well as return values.
2681   // Thread is in rdi already.
2682   //
2683   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2684 
2685   __ mov(c_rarg0, r15_thread);
2686   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2687   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2688 
2689   // Set an oopmap for the call site
2690   OopMapSet* oop_maps = new OopMapSet();
2691   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2692 
2693   // location of rbp is known implicitly by the frame sender code
2694 
2695   oop_maps->add_gc_map(__ pc() - start, map);
2696 
2697   __ reset_last_Java_frame(false);
2698 
2699   // Load UnrollBlock* into rdi
2700   __ mov(rdi, rax);
2701 
2702 #ifdef ASSERT
2703   { Label L;
2704     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2705             (int32_t)Deoptimization::Unpack_uncommon_trap);
2706     __ jcc(Assembler::equal, L);
2707     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2708     __ bind(L);
2709   }
2710 #endif
2711 
2712   // Pop all the frames we must move/replace.
2713   //
2714   // Frame picture (youngest to oldest)
2715   // 1: self-frame (no frame link)
2716   // 2: deopting frame  (no frame link)
2717   // 3: caller of deopting frame (could be compiled/interpreted).
2718 
2719   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2720   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2721 
2722   // Pop deoptimized frame (int)
2723   __ movl(rcx, Address(rdi,
2724                        Deoptimization::UnrollBlock::
2725                        size_of_deoptimized_frame_offset_in_bytes()));
2726   __ addptr(rsp, rcx);
2727 
2728   // rsp should be pointing at the return address to the caller (3)
2729 
2730   // Pick up the initial fp we should save
2731   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2732   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2733 
2734 #ifdef ASSERT
2735   // Compilers generate code that bangs the stack by as much as the
2736   // interpreter would need. So this stack banging should never
2737   // trigger a fault. Verify that it does not on non-product builds.
2738   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2739   __ bang_stack_size(rbx, rcx);
2740 #endif
2741 
2742   // Load address of array of frame pcs into rcx (address*)
2743   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2744 
2745   // Trash the return pc
2746   __ addptr(rsp, wordSize);
2747 
2748   // Load address of array of frame sizes into rsi (intptr_t*)
2749   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2750 
2751   // Counter
2752   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
2753 
2754   // Now adjust the caller's stack to make up for the extra locals, but
2755   // record the original sp first so that we can save it in the skeletal
2756   // interpreter frame; the stack walking of interpreter_sender will then
2757   // get the unextended sp value and not the "real" sp value.
2758 
2759   const Register sender_sp = r8;
2760 
2761   __ mov(sender_sp, rsp);
2762   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
2763   __ subptr(rsp, rbx);
2764 
2765   // Push interpreter frames in a loop
2766   Label loop;
2767   __ bind(loop);
2768   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2769   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2770   __ pushptr(Address(rcx, 0));     // Save return address
2771   __ enter();                      // Save old & set new rbp
2772   __ subptr(rsp, rbx);             // Prolog
2773   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2774             sender_sp);            // Make it walkable
2775   // This value is corrected by layout_activation_impl
2776   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2777   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
2778   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
2779   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
2780   __ decrementl(rdx);              // Decrement counter
2781   __ jcc(Assembler::notZero, loop);
2782   __ pushptr(Address(rcx, 0));     // Save final return address
2783 
2784   // Re-push self-frame
2785   __ enter();                 // Save old & set new rbp
2786   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2787                               // Prolog
2788 
2789   // Use rbp because the frames look interpreted now
2790   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2791   // Don't need the precise return PC here, just precise enough to point into this code blob.
2792   address the_pc = __ pc();
2793   __ set_last_Java_frame(noreg, rbp, the_pc);
2794 
2795   // Call C code.  Need thread but NOT official VM entry
2796   // crud.  We cannot block on this call, no GC can happen.  Call should
2797   // restore return values to their stack-slots with the new SP.
2798   // Thread is in rdi already.
2799   //
2800   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2801 
2802   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2803   __ mov(c_rarg0, r15_thread);
2804   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2805   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2806 
2807   // Set an oopmap for the call site
2808   // Use the same PC we used for the last java frame
2809   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2810 
2811   // Clear fp AND pc
2812   __ reset_last_Java_frame(true);
2813 
2814   // Pop self-frame.
2815   __ leave();                 // Epilog
2816 
2817   // Jump to interpreter
2818   __ ret(0);
2819 
2820   // Make sure all code is generated
2821   masm->flush();
2822 
2823   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
2824                                                  SimpleRuntimeFrame::framesize >> 1);
2825 }
2826 #endif // COMPILER2
2827 
2828 //------------------------------generate_handler_blob------
2829 //
2830 // Generate a special Compile2Runtime blob that saves all registers
2831 // and sets up an oopmap.
2832 //
2833 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
2834   assert(StubRoutines::forward_exception_entry() != NULL,
2835          "must be generated before");
2836 
2837   ResourceMark rm;
2838   OopMapSet *oop_maps = new OopMapSet();
2839   OopMap* map;
2840 
2841   // Allocate space for the code.  Setup code generation tools.
2842   CodeBuffer buffer("handler_blob", 2048, 1024);
2843   MacroAssembler* masm = new MacroAssembler(&buffer);
2844 
2845   address start   = __ pc();
2846   address call_pc = NULL;
2847   int frame_size_in_words;
2848   bool cause_return = (poll_type == POLL_AT_RETURN);
2849   bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
2850 
2851   if (UseRTMLocking) {
2852     // Abort RTM transaction before calling runtime
2853     // because critical section will be large and will be
2854     // aborted anyway. Also nmethod could be deoptimized.
2855     __ xabort(0);
2856   }
2857 
2858   // Make room for return address (or push it again)
2859   if (!cause_return) {
2860     __ push(rbx);
2861   }
2862 
2863   // Save registers, fpu state, and flags
2864   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
2865 
2866   // The following is basically a call_VM.  However, we need the precise
2867   // address of the call in order to generate an oopmap. Hence, we do all the
2868   // work ourselves.
2869 
2870   __ set_last_Java_frame(noreg, noreg, NULL);
2871 
2872   // The return address must always be correct so that the frame constructor
2873   // never sees an invalid pc.
2874 
2875   if (!cause_return) {
2876     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
2877     // Additionally, rbx is a callee saved register and we can look at it later to determine
2878     // if someone changed the return address for us!
2879     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
2880     __ movptr(Address(rbp, wordSize), rbx);
2881   }
2882 
2883   // Do the call
2884   __ mov(c_rarg0, r15_thread);
2885   __ call(RuntimeAddress(call_ptr));
2886 
2887   // Set an oopmap for the call site.  This oopmap will map all
2888   // oop-registers and debug-info registers as callee-saved.  This
2889   // will allow deoptimization at this safepoint to find all possible
2890   // debug-info recordings, as well as let GC find all oops.
2891 
2892   oop_maps->add_gc_map( __ pc() - start, map);
2893 
2894   Label noException;
2895 
2896   __ reset_last_Java_frame(false);
2897 
2898   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
2899   __ jcc(Assembler::equal, noException);
2900 
2901   // Exception pending
2902 
2903   RegisterSaver::restore_live_registers(masm, save_vectors);
2904 
2905   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2906 
2907   // No exception case
2908   __ bind(noException);
2909 
2910   Label no_adjust;
2911 #ifdef ASSERT
2912   Label bail;
2913 #endif
2914   if (!cause_return) {
2915     Label no_prefix, not_special;
2916 
2917     // If our stashed return pc was modified by the runtime we avoid touching it
2918     __ cmpptr(rbx, Address(rbp, wordSize));
2919     __ jccb(Assembler::notEqual, no_adjust);
2920 
2921     // Skip over the poll instruction.
2922     // See NativeInstruction::is_safepoint_poll()
2923     // Possible encodings:
2924     //      85 00       test   %eax,(%rax)
2925     //      85 01       test   %eax,(%rcx)
2926     //      85 02       test   %eax,(%rdx)
2927     //      85 03       test   %eax,(%rbx)
2928     //      85 06       test   %eax,(%rsi)
2929     //      85 07       test   %eax,(%rdi)
2930     //
2931     //   41 85 00       test   %eax,(%r8)
2932     //   41 85 01       test   %eax,(%r9)
2933     //   41 85 02       test   %eax,(%r10)
2934     //   41 85 03       test   %eax,(%r11)
2935     //   41 85 06       test   %eax,(%r14)
2936     //   41 85 07       test   %eax,(%r15)
2937     //
2938     //      85 04 24    test   %eax,(%rsp)
2939     //   41 85 04 24    test   %eax,(%r12)
2940     //      85 45 00    test   %eax,0x0(%rbp)
2941     //   41 85 45 00    test   %eax,0x0(%r13)
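         // In effect, the code below recomputes the poll instruction's length so
         // that the return pc can be advanced past it. A C-like sketch of the
         // decode (illustration only, mirroring the byte tests that follow):
         //
         //   u_char* p = return_pc;
         //   if (p[0] == 0x41) p++;            // optional REX.B prefix (r8-r15 base)
         //   // p now points at the 0x85 opcode; rsp/rbp/r12/r13 bases need one
         //   // extra byte (a SIB byte for rsp/r12, a zero disp8 for rbp/r13)
         //   if ((p[1] & 0x07) == 0x04 || (p[1] & 0x07) == 0x05) p++;
         //   return_pc = p + 2;                // skip opcode + modrm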
2942 
2943     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
2944     __ jcc(Assembler::notEqual, no_prefix);
2945     __ addptr(rbx, 1);
2946     __ bind(no_prefix);
2947 #ifdef ASSERT
2948     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
2949 #endif
2950     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
2951     // r12/rsp 0x04
2952     // r13/rbp 0x05
2953     __ movzbq(rcx, Address(rbx, 1));
2954     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
2955     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
2956     __ cmpptr(rcx, 1);
2957     __ jcc(Assembler::above, not_special);
2958     __ addptr(rbx, 1);
2959     __ bind(not_special);
2960 #ifdef ASSERT
2961     // Verify the correct encoding of the poll we're about to skip.
2962     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
2963     __ jcc(Assembler::notEqual, bail);
2964     // Mask out the modrm bits
2965     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
2966     // rax encodes to 0, so if the bits are nonzero it's incorrect
2967     __ jcc(Assembler::notZero, bail);
2968 #endif
2969     // Adjust return pc forward to step over the safepoint poll instruction
2970     __ addptr(rbx, 2);
2971     __ movptr(Address(rbp, wordSize), rbx);
2972   }
2973 
2974   __ bind(no_adjust);
2975   // Normal exit, restore registers and exit.
2976   RegisterSaver::restore_live_registers(masm, save_vectors);
2977   __ ret(0);
2978 
2979 #ifdef ASSERT
2980   __ bind(bail);
2981   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
2982 #endif
2983 
2984   // Make sure all code is generated
2985   masm->flush();
2986 
2987   // Fill-out other meta info
2988   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
2989 }
2990 
2991 //
2992 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
2993 //
2994 // Generate a stub that calls into the VM to find out the proper destination
2995 // of a Java call. All the argument registers are live at this point,
2996 // but since this is generic code we don't know what they are and the caller
2997 // must do any GC of the args.
2998 //
2999 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3000   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3001 
3002   // allocate space for the code
3003   ResourceMark rm;
3004 
3005   CodeBuffer buffer(name, 1000, 512);
3006   MacroAssembler* masm                = new MacroAssembler(&buffer);
3007 
3008   int frame_size_in_words;
3009 
3010   OopMapSet *oop_maps = new OopMapSet();
3011   OopMap* map = NULL;
3012 
3013   int start = __ offset();
3014 
3015   // No need to save vector registers since they are caller-saved anyway.
3016   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
3017 
3018   int frame_complete = __ offset();
3019 
3020   __ set_last_Java_frame(noreg, noreg, NULL);
3021 
3022   __ mov(c_rarg0, r15_thread);
3023 
3024   __ call(RuntimeAddress(destination));
3025 
3026 
3027   // Set an oopmap for the call site.
3028   // We need this not only for callee-saved registers, but also for volatile
3029   // registers that the compiler might be keeping live across a safepoint.
3030 
3031   oop_maps->add_gc_map( __ offset() - start, map);
3032 
3033   // rax contains the address we are going to jump to, assuming no exception got installed
3034 
3035   // clear last_Java_sp
3036   __ reset_last_Java_frame(false);
3037   // check for pending exceptions
3038   Label pending;
3039   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3040   __ jcc(Assembler::notEqual, pending);
3041 
3042   // get the returned Method*
3043   __ get_vm_result_2(rbx, r15_thread);
3044   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3045 
3046   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3047 
3048   RegisterSaver::restore_live_registers(masm);
3049 
3050   // We are back to the original state on entry and ready to go.
3051 
3052   __ jmp(rax);
3053 
3054   // Pending exception after the safepoint
3055 
3056   __ bind(pending);
3057 
3058   RegisterSaver::restore_live_registers(masm);
3059 
3060   // exception pending => remove activation and forward to exception handler
3061 
3062   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3063 
3064   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3065   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3066 
3067   // -------------
3068   // make sure all code is generated
3069   masm->flush();
3070 
3071   // return the blob
3072   // (frame size is passed to new_runtime_stub in words)
3073   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3074 }
3075 
3076 #ifdef COMPILER2
3077 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3078 
3079 class NativeInvokerGenerator : public StubCodeGenerator {
3080   address _call_target;
3081   int _shadow_space_bytes;
3082 
3083   const GrowableArray<VMReg>& _input_registers;
3084   const GrowableArray<VMReg>& _output_registers;
3085 
3086   int _frame_complete;
3087   int _framesize;
3088   OopMapSet* _oop_maps;
3089 public:
3090   NativeInvokerGenerator(CodeBuffer* buffer,
3091                          address call_target,
3092                          int shadow_space_bytes,
3093                          const GrowableArray<VMReg>& input_registers,
3094                          const GrowableArray<VMReg>& output_registers)
3095    : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3096      _call_target(call_target),
3097      _shadow_space_bytes(shadow_space_bytes),
3098      _input_registers(input_registers),
3099      _output_registers(output_registers),
3100      _frame_complete(0),
3101      _framesize(0),
3102      _oop_maps(NULL) {
3103     assert(_output_registers.length() <= 1
3104            || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3105 
3106   }
3107 
3108   void generate();
3109 
3110   int spill_size_in_bytes() const {
3111     if (_output_registers.length() == 0) {
3112       return 0;
3113     }
3114     VMReg reg = _output_registers.at(0);
3115     assert(reg->is_reg(), "must be a register");
3116     if (reg->is_Register()) {
3117       return 8;
3118     } else if (reg->is_XMMRegister()) {
3119       if (UseAVX >= 3) {
3120         return 64;
3121       } else if (UseAVX >= 1) {
3122         return 32;
3123       } else {
3124         return 16;
3125       }
3126     } else {
3127       ShouldNotReachHere();
3128     }
3129     return 0;
3130   }
3131 
3132   void spill_out_registers() {
3133     if (_output_registers.length() == 0) {
3134       return;
3135     }
3136     VMReg reg = _output_registers.at(0);
3137     assert(reg->is_reg(), "must be a register");
3138     MacroAssembler* masm = _masm;
3139     if (reg->is_Register()) {
3140       __ movptr(Address(rsp, 0), reg->as_Register());
3141     } else if (reg->is_XMMRegister()) {
3142       if (UseAVX >= 3) {
3143         __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3144       } else if (UseAVX >= 1) {
3145         __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3146       } else {
3147         __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3148       }
3149     } else {
3150       ShouldNotReachHere();
3151     }
3152   }
3153 
3154   void fill_out_registers() {
3155     if (_output_registers.length() == 0) {
3156       return;
3157     }
3158     VMReg reg = _output_registers.at(0);
3159     assert(reg->is_reg(), "must be a register");
3160     MacroAssembler* masm = _masm;
3161     if (reg->is_Register()) {
3162       __ movptr(reg->as_Register(), Address(rsp, 0));
3163     } else if (reg->is_XMMRegister()) {
3164       if (UseAVX >= 3) {
3165         __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3166       } else if (UseAVX >= 1) {
3167         __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3168       } else {
3169         __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3170       }
3171     } else {
3172       ShouldNotReachHere();
3173     }
3174   }
3175 
3176   int frame_complete() const {
3177     return _frame_complete;
3178   }
3179 
3180   int framesize() const {
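         // _framesize is kept in 32-bit slots; shifting by
         // (LogBytesPerWord - LogBytesPerInt), i.e. by 1 on x86_64, converts it
         // to the word count passed to RuntimeStub::new_runtime_stub below.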
3181     return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3182   }
3183 
3184   OopMapSet* oop_maps() const {
3185     return _oop_maps;
3186   }
3187 
3188 private:
3189 #ifdef ASSERT
3190   bool target_uses_register(VMReg reg) {
3191     return _input_registers.contains(reg) || _output_registers.contains(reg);
3192   }
3193 #endif
3194 };
3195 
3196 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3197                                                 int shadow_space_bytes,
3198                                                 const GrowableArray<VMReg>& input_registers,
3199                                                 const GrowableArray<VMReg>& output_registers) {
3200   int locs_size  = 64;
3201   CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3202   NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3203   g.generate();
3204   code.log_section_sizes("nep_invoker_blob");
3205 
3206   RuntimeStub* stub =
3207     RuntimeStub::new_runtime_stub("nep_invoker_blob",
3208                                   &code,
3209                                   g.frame_complete(),
3210                                   g.framesize(),
3211                                   g.oop_maps(), false);
3212   return stub;
3213 }
3214 
3215 void NativeInvokerGenerator::generate() {
3216   assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3217 
3218   enum layout {
3219     rbp_off,
3220     rbp_off2,
3221     return_off,
3222     return_off2,
3223     framesize // inclusive of return address
3224   };
3225 
3226   _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3227   assert(is_even(_framesize/2), "sp not 16-byte aligned");
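       // For example, assuming no shadow space and an XMM return value with
       // UseAVX == 2 (a 32-byte spill area), this works out to
       // _framesize = align_up(4 + (32 >> 2), 4) = 12 slots, i.e. a 48-byte
       // frame including the saved rbp and return address, keeping rsp
       // 16-byte aligned.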
3228 
3229   _oop_maps  = new OopMapSet();
3230   MacroAssembler* masm = _masm;
3231 
3232   address start = __ pc();
3233 
3234   __ enter();
3235 
3236   // return address and rbp are already in place
3237   __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3238 
3239   _frame_complete = __ pc() - start;
3240 
3241   address the_pc = __ pc();
3242 
3243   __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3244   OopMap* map = new OopMap(_framesize, 0);
3245   _oop_maps->add_gc_map(the_pc - start, map);
3246 
3247   // State transition
3248   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3249 
3250   __ call(RuntimeAddress(_call_target));
3251 
3252   __ restore_cpu_control_state_after_jni();
3253 
3254   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3255 
3256   // Force this write out before the read below
3257   __ membar(Assembler::Membar_mask_bits(
3258           Assembler::LoadLoad | Assembler::LoadStore |
3259           Assembler::StoreLoad | Assembler::StoreStore));
3260 
3261   Label L_after_safepoint_poll;
3262   Label L_safepoint_poll_slow_path;
3263 
3264   __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3265   __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3266   __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3267 
3268   __ bind(L_after_safepoint_poll);
3269 
3270   // change thread state
3271   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3272 
3273   __ block_comment("reguard stack check");
3274   Label L_reguard;
3275   Label L_after_reguard;
3276   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3277   __ jcc(Assembler::equal, L_reguard);
3278   __ bind(L_after_reguard);
3279 
3280   __ reset_last_Java_frame(r15_thread, true);
3281 
3282   __ leave(); // required for proper stackwalking of RuntimeStub frame
3283   __ ret(0);
3284 
3285   //////////////////////////////////////////////////////////////////////////////
3286 
3287   __ block_comment("{ L_safepoint_poll_slow_path");
3288   __ bind(L_safepoint_poll_slow_path);
3289   __ vzeroupper();
3290 
3291   spill_out_registers();
3292 
3293   __ mov(c_rarg0, r15_thread);
3294   __ mov(r12, rsp); // remember sp
3295   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3296   __ andptr(rsp, -16); // align stack as required by ABI
3297   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3298   __ mov(rsp, r12); // restore sp
3299   __ reinit_heapbase();
3300 
3301   fill_out_registers();
3302 
3303   __ jmp(L_after_safepoint_poll);
3304   __ block_comment("} L_safepoint_poll_slow_path");
3305 
3306   //////////////////////////////////////////////////////////////////////////////
3307 
3308   __ block_comment("{ L_reguard");
3309   __ bind(L_reguard);
3310   __ vzeroupper();
3311 
3312   spill_out_registers();
3313 
3314   __ mov(r12, rsp); // remember sp
3315   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3316   __ andptr(rsp, -16); // align stack as required by ABI
3317   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3318   __ mov(rsp, r12); // restore sp
3319   __ reinit_heapbase();
3320 
3321   fill_out_registers();
3322 
3323   __ jmp(L_after_reguard);
3324 
3325   __ block_comment("} L_reguard");
3326 
3327   //////////////////////////////////////////////////////////////////////////////
3328 
3329   __ flush();
3330 }
3331 #endif // COMPILER2
3332 
3333 //------------------------------Montgomery multiplication------------------------
3334 //
3335 
3336 #ifndef _WINDOWS
3337 
3338 // Subtract 0:b from carry:a.  Return carry.
3339 static julong
3340 sub(julong a[], julong b[], julong carry, long len) {
3341   long long i = 0, cnt = len;
3342   julong tmp;
3343   asm volatile("clc; "
3344                "0: ; "
3345                "mov (%[b], %[i], 8), %[tmp]; "
3346                "sbb %[tmp], (%[a], %[i], 8); "
3347                "inc %[i]; dec %[cnt]; "
3348                "jne 0b; "
3349                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3350                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3351                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3352                : "memory");
3353   return tmp;
3354 }
3355 
3356 // Multiply (unsigned) Long A by Long B, accumulating the double-
3357 // length result into the accumulator formed of T0, T1, and T2.
3358 #define MACC(A, B, T0, T1, T2)                                  \
3359 do {                                                            \
3360   unsigned long hi, lo;                                         \
3361   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3362            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3363            : "r"(A), "a"(B) : "cc");                            \
3364  } while(0)
3365 
3366 // As above, but add twice the double-length result into the
3367 // accumulator.
3368 #define MACC2(A, B, T0, T1, T2)                                 \
3369 do {                                                            \
3370   unsigned long hi, lo;                                         \
3371   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3372            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3373            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3374            : "r"(A), "a"(B) : "cc");                            \
3375  } while(0)
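
     // A portable sketch of what MACC computes, assuming a compiler with
     // unsigned __int128 support (illustration only; the macro above uses
     // inline asm for speed). MACC2 is the same except the product is added
     // twice:
     //
     //   static inline void macc_ref(julong A, julong B,
     //                               julong& T0, julong& T1, julong& T2) {
     //     unsigned __int128 p = (unsigned __int128)A * B + T0;
     //     T0 = (julong)p;
     //     p = (p >> 64) + T1;
     //     T1 = (julong)p;
     //     T2 += (julong)(p >> 64);
     //   }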
3376 
3377 #else //_WINDOWS
3378 
3379 static julong
3380 sub(julong a[], julong b[], julong carry, long len) {
3381   long i;
3382   julong tmp;
3383   unsigned char c = 1;
3384   for (i = 0; i < len; i++) {
3385     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3386     a[i] = tmp;
3387   }
3388   c = _addcarry_u64(c, carry, ~0, &tmp);
3389   return tmp;
3390 }
3391 
3392 // Multiply (unsigned) Long A by Long B, accumulating the double-
3393 // length result into the accumulator formed of T0, T1, and T2.
3394 #define MACC(A, B, T0, T1, T2)                          \
3395 do {                                                    \
3396   julong hi, lo;                            \
3397   lo = _umul128(A, B, &hi);                             \
3398   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3399   c = _addcarry_u64(c, hi, T1, &T1);                    \
3400   _addcarry_u64(c, T2, 0, &T2);                         \
3401  } while(0)
3402 
3403 // As above, but add twice the double-length result into the
3404 // accumulator.
3405 #define MACC2(A, B, T0, T1, T2)                         \
3406 do {                                                    \
3407   julong hi, lo;                            \
3408   lo = _umul128(A, B, &hi);                             \
3409   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3410   c = _addcarry_u64(c, hi, T1, &T1);                    \
3411   _addcarry_u64(c, T2, 0, &T2);                         \
3412   c = _addcarry_u64(0, lo, T0, &T0);                    \
3413   c = _addcarry_u64(c, hi, T1, &T1);                    \
3414   _addcarry_u64(c, T2, 0, &T2);                         \
3415  } while(0)
3416 
3417 #endif //_WINDOWS
3418 
3419 // Fast Montgomery multiplication.  The derivation of the algorithm is
3420 // in  A Cryptographic Library for the Motorola DSP56000,
3421 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
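     //
     // With R = 2^(64*len) and inv == -n[0]^-1 (mod 2^64) (which is what the
     // asserts below check), the routine computes m such that
     // m == a * b * R^-1 (mod n): each m[i] = t0 * inv is chosen so that
     // t0 + m[i]*n[0] == 0 (mod 2^64), which is why t0 must be zero after the
     // reduction step.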
3422 
3423 static void NOINLINE
3424 montgomery_multiply(julong a[], julong b[], julong n[],
3425                     julong m[], julong inv, int len) {
3426   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3427   int i;
3428 
3429   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3430 
3431   for (i = 0; i < len; i++) {
3432     int j;
3433     for (j = 0; j < i; j++) {
3434       MACC(a[j], b[i-j], t0, t1, t2);
3435       MACC(m[j], n[i-j], t0, t1, t2);
3436     }
3437     MACC(a[i], b[0], t0, t1, t2);
3438     m[i] = t0 * inv;
3439     MACC(m[i], n[0], t0, t1, t2);
3440 
3441     assert(t0 == 0, "broken Montgomery multiply");
3442 
3443     t0 = t1; t1 = t2; t2 = 0;
3444   }
3445 
3446   for (i = len; i < 2*len; i++) {
3447     int j;
3448     for (j = i-len+1; j < len; j++) {
3449       MACC(a[j], b[i-j], t0, t1, t2);
3450       MACC(m[j], n[i-j], t0, t1, t2);
3451     }
3452     m[i-len] = t0;
3453     t0 = t1; t1 = t2; t2 = 0;
3454   }
3455 
3456   while (t0)
3457     t0 = sub(m, n, t0, len);
3458 }
3459 
3460 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3461 // multiplies so it should be up to 25% faster than Montgomery
3462 // multiplication.  However, its loop control is more complex and it
3463 // may actually run slower on some machines.
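     //
     // The saving comes from symmetry: for j != i-j the cross products
     // a[j]*a[i-j] and a[i-j]*a[j] are equal, so each pair is computed once and
     // doubled via MACC2; only the square term a[j]*a[j] in the even columns is
     // added singly.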
3464 
3465 static void NOINLINE
3466 montgomery_square(julong a[], julong n[],
3467                   julong m[], julong inv, int len) {
3468   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3469   int i;
3470 
3471   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3472 
3473   for (i = 0; i < len; i++) {
3474     int j;
3475     int end = (i+1)/2;
3476     for (j = 0; j < end; j++) {
3477       MACC2(a[j], a[i-j], t0, t1, t2);
3478       MACC(m[j], n[i-j], t0, t1, t2);
3479     }
3480     if ((i & 1) == 0) {
3481       MACC(a[j], a[j], t0, t1, t2);
3482     }
3483     for (; j < i; j++) {
3484       MACC(m[j], n[i-j], t0, t1, t2);
3485     }
3486     m[i] = t0 * inv;
3487     MACC(m[i], n[0], t0, t1, t2);
3488 
3489     assert(t0 == 0, "broken Montgomery square");
3490 
3491     t0 = t1; t1 = t2; t2 = 0;
3492   }
3493 
3494   for (i = len; i < 2*len; i++) {
3495     int start = i-len+1;
3496     int end = start + (len - start)/2;
3497     int j;
3498     for (j = start; j < end; j++) {
3499       MACC2(a[j], a[i-j], t0, t1, t2);
3500       MACC(m[j], n[i-j], t0, t1, t2);
3501     }
3502     if ((i & 1) == 0) {
3503       MACC(a[j], a[j], t0, t1, t2);
3504     }
3505     for (; j < len; j++) {
3506       MACC(m[j], n[i-j], t0, t1, t2);
3507     }
3508     m[i-len] = t0;
3509     t0 = t1; t1 = t2; t2 = 0;
3510   }
3511 
3512   while (t0)
3513     t0 = sub(m, n, t0, len);
3514 }
3515 
3516 // Swap words in a longword.
3517 static julong swap(julong x) {
3518   return (x << 32) | (x >> 32);
3519 }
3520 
3521 // Copy len longwords from s to d, word-swapping as we go.  The
3522 // destination array is reversed.
3523 static void reverse_words(julong *s, julong *d, int len) {
3524   d += len;
3525   while(len-- > 0) {
3526     d--;
3527     *d = swap(*s);
3528     s++;
3529   }
3530 }
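
     // For example, assuming the usual most-significant-int-first layout of the
     // incoming jint array, the four jints {0x11111111, 0x22222222, 0x33333333,
     // 0x44444444} are read on this little-endian machine as the julongs
     // {0x2222222211111111, 0x4444444433333333}; reverse_words turns that into
     // {0x3333333344444444, 0x1111111122222222}, i.e. least-significant 64-bit
     // word first, with each word internally in its natural order.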
3531 
3532 // The threshold at which squaring is advantageous was determined
3533 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3534 #define MONTGOMERY_SQUARING_THRESHOLD 64
3535 
3536 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3537                                         jint len, jlong inv,
3538                                         jint *m_ints) {
3539   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3540   int longwords = len/2;
3541 
3542   // Make very sure we don't use so much space that the stack might
3543   // overflow.  512 jints corresponds to a 16384-bit integer and
3544   // will use a total of 8K bytes of stack space here.
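       // Concretely: divisor = sizeof(julong) * 4 = 32, so longwords <= 8192/32
       // = 256 (i.e. len <= 512 jints), and the four scratch arrays below take
       // at most 4 * 256 * 8 = 8192 bytes.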
3545   int divisor = sizeof(julong) * 4;
3546   guarantee(longwords <= 8192 / divisor, "must be");
3547   int total_allocation = longwords * sizeof (julong) * 4;
3548   julong *scratch = (julong *)alloca(total_allocation);
3549 
3550   // Local scratch arrays
3551   julong
3552     *a = scratch + 0 * longwords,
3553     *b = scratch + 1 * longwords,
3554     *n = scratch + 2 * longwords,
3555     *m = scratch + 3 * longwords;
3556 
3557   reverse_words((julong *)a_ints, a, longwords);
3558   reverse_words((julong *)b_ints, b, longwords);
3559   reverse_words((julong *)n_ints, n, longwords);
3560 
3561   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3562 
3563   reverse_words(m, (julong *)m_ints, longwords);
3564 }
3565 
3566 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3567                                       jint len, jlong inv,
3568                                       jint *m_ints) {
3569   assert(len % 2 == 0, "array length in montgomery_square must be even");
3570   int longwords = len/2;
3571 
3572   // Make very sure we don't use so much space that the stack might
3573   // overflow.  512 jints corresponds to a 16384-bit integer and
3574   // will use a total of 6K bytes of stack space here.
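       // Concretely: divisor = sizeof(julong) * 3 = 24, so longwords <= 8192/24
       // = 341, and the three scratch arrays below take at most 341 * 3 * 8 =
       // 8184 bytes (6144 bytes for the 512-jint case mentioned above).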
3575   int divisor = sizeof(julong) * 3;
3576   guarantee(longwords <= (8192 / divisor), "must be");
3577   int total_allocation = longwords * sizeof (julong) * 3;
3578   julong *scratch = (julong *)alloca(total_allocation);
3579 
3580   // Local scratch arrays
3581   julong
3582     *a = scratch + 0 * longwords,
3583     *n = scratch + 1 * longwords,
3584     *m = scratch + 2 * longwords;
3585 
3586   reverse_words((julong *)a_ints, a, longwords);
3587   reverse_words((julong *)n_ints, n, longwords);
3588 
3589   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3590     ::montgomery_square(a, n, m, (julong)inv, longwords);
3591   } else {
3592     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3593   }
3594 
3595   reverse_words(m, (julong *)m_ints, longwords);
3596 }
3597 
3598 #ifdef COMPILER2
3599 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3600 //
3601 //------------------------------generate_exception_blob---------------------------
3602 // Creates the exception blob at the end.
3603 // Using the exception blob, this code is jumped to from a compiled method.
3604 // (see emit_exception_handler in x86_64.ad file)
3605 //
3606 // Given an exception pc at a call, we call into the runtime for the
3607 // handler in this method. This handler might merely restore state
3608 // (i.e. callee-saved registers), unwind the frame, and jump to the
3609 // exception handler for the nmethod if there is no Java-level handler
3610 // for the nmethod.
3611 //
3612 // This code is entered with a jmp.
3613 //
3614 // Arguments:
3615 //   rax: exception oop
3616 //   rdx: exception pc
3617 //
3618 // Results:
3619 //   rax: exception oop
3620 //   rdx: exception pc in caller or ???
3621 //   destination: exception handler of caller
3622 //
3623 // Note: the exception pc MUST be at a call (precise debug information)
3624 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3625 //
3626 
3627 void OptoRuntime::generate_exception_blob() {
3628   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3629   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3630   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3631 
3632   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3633 
3634   // Allocate space for the code
3635   ResourceMark rm;
3636   // Setup code generation tools
3637   CodeBuffer buffer("exception_blob", 2048, 1024);
3638   MacroAssembler* masm = new MacroAssembler(&buffer);
3639 
3640 
3641   address start = __ pc();
3642 
3643   // Exception pc is 'return address' for stack walker
3644   __ push(rdx);
3645   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3646 
3647   // Save callee-saved registers.  See x86_64.ad.
3648 
3649   // rbp is an implicitly saved callee saved register (i.e., the calling
3650   // convention will save/restore it in the prolog/epilog). Other than that
3651   // there are no callee save registers now that adapter frames are gone.
3652 
3653   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3654 
3655   // Store exception in Thread object. We cannot pass any arguments to the
3656   // handle_exception call, since we do not want to make any assumption
3657   // about the size of the frame in which the exception happened.
3658   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3659   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3660   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3661 
3662   // This call does all the hard work.  It checks if an exception handler
3663   // exists in the method.
3664   // If so, it returns the handler address.
3665   // If not, it prepares for stack-unwinding, restoring the callee-save
3666   // registers of the frame being removed.
3667   //
3668   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3669 
3670   // At a method handle call, the stack may not be properly aligned
3671   // when returning with an exception.
3672   address the_pc = __ pc();
3673   __ set_last_Java_frame(noreg, noreg, the_pc);
3674   __ mov(c_rarg0, r15_thread);
3675   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3676   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3677 
3678   // Set an oopmap for the call site.  This oopmap will only be used if we
3679   // are unwinding the stack.  Hence, all locations will be dead.
3680   // Callee-saved registers will be the same as the frame above (i.e.,
3681   // handle_exception_stub), since they were restored when we got the
3682   // exception.
3683 
3684   OopMapSet* oop_maps = new OopMapSet();
3685 
3686   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3687 
3688   __ reset_last_Java_frame(false);
3689 
3690   // Restore callee-saved registers
3691 
3692   // rbp is an implicitly saved callee-saved register (i.e., the calling
3693   // convention will save/restore it in the prolog/epilog). Other than that
3694   // there are no callee save registers now that adapter frames are gone.
3695 
3696   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3697 
3698   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3699   __ pop(rdx);                  // No need for exception pc anymore
3700 
3701   // rax: exception handler
3702 
3703   // We have a handler in rax (could be deopt blob).
3704   __ mov(r8, rax);
3705 
3706   // Get the exception oop
3707   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3708   // Get the exception pc in case we are deoptimized
3709   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3710 #ifdef ASSERT
3711   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3712   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3713 #endif
3714   // Clear the exception oop so GC no longer processes it as a root.
3715   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3716 
3717   // rax: exception oop
3718   // r8:  exception handler
3719   // rdx: exception pc
3720   // Jump to handler
3721 
3722   __ jmp(r8);
3723 
3724   // Make sure all code is generated
3725   masm->flush();
3726 
3727   // Set exception blob
3728   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3729 }
3730 #endif // COMPILER2
3731 
3732 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
3733                                        int total_in_args, const VMRegPair* in_regs,
3734                                        int total_out_args, VMRegPair* out_regs,
3735                                        GrowableArray<int>& arg_order,
3736                                        VMRegPair tmp_vmreg) {
3737   ComputeMoveOrder order(total_in_args, in_regs,
3738                          total_out_args, out_regs,
3739                          in_sig_bt, arg_order, tmp_vmreg);
3740 }