/*
 * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/compiledICHolder.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

  public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};

class RegisterSaver {
  // Capture info about frame layout.  Layout offsets are in jint
  // units because compiler frame slots are jints.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
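// Note (descriptive, added for clarity): the XSAVE_AREA_* values above are byte
// offsets into the register save area laid down below; 160 is where xmm0 starts
// inside the legacy FXSAVE image, and the later values appear to follow the
// standard XSAVE layout for the YMM-high, opmask and ZMM state components used
// by the AVX/EVEX save paths further down.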
#define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};

// Register is a class, but it is assigned a numerical value.
// "0" is assigned for rax. Thus we need to ignore -Wnonnull.
PRAGMA_DIAG_PUSH
PRAGMA_NONNULL_IGNORED
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
#if COMPILER2_OR_JVMCI
  if (save_vectors && UseAVX == 0) {
    save_vectors = false; // vectors larger than 16 byte long are supported only with AVX
  }
  assert(!save_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push cpu state handles this on EVEX enabled targets
  if (save_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of ZMM registers(16..31) for double/float usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets it is included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets it is included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}
PRAGMA_DIAG_POP

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of ZMM registers(16..31) for double/float usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Register
// values up to RegisterImpl::number_of_registers are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64 bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the java ABI we ought to at least get some
// advantage out of it.
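// For example (illustrative only, following the mapping below): a signature of
// (int, long, Object, double) would be assigned int -> j_rarg0,
// long -> j_rarg1 (its trailing T_VOID half gets no register), Object -> j_rarg2
// and double -> j_farg0; arguments that overflow the register sets fall back
// to 4-byte stack slots, two slots per argument.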

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return align_up(stk_args, 2);
}

// Patch the callers callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need. Plus 1 because
  // we also account for the return address location since
  // we store it first rather than hold it in rax across all the shuffling

  int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;

  // stack is aligned, keep it that way
  extraspace = align_up(extraspace, 2*wordSize);

  // Get return address
  __ pop(rax);

  // set senderSP value
  __ mov(r13, rsp);

  __ subptr(rsp, extraspace);

  // Store the return address in the expected location
  __ movptr(Address(rsp, 0), rax);

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    // i   st_off
    // 0   32 T_LONG
    // 1   24 T_VOID
    // 2   16 T_OBJECT
    // 3    8 T_BOOL
    // -    0 return address
    //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less) so move only 32bits to slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float, use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

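// range_check: branches to L_ok if pc_reg lies strictly inside
// (code_start, code_end); otherwise control reaches L_fail, which is bound at
// the end so the caller can emit the failure handling immediately after.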
static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Pick up the return address
  __ movptr(rax, Address(rsp, 0));

  if (VerifyAdapterCalls &&
      (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    Label L_ok;
    if (Interpreter::code() != NULL)
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(), Interpreter::code()->code_end(),
                  L_ok);
    if (StubRoutines::code1() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
                  L_ok);
    if (StubRoutines::code2() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
                  L_ok);
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2ce ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
  // in registers, we will occasionally have no stack args.
  int comp_words_on_stack = 0;
  if (comp_args_on_stack) {
    // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
    // registers are below.  By subtracting stack0, we either get a negative
    // number (all values in registers) or the maximum stack slot accessed.

    // Convert 4-byte c2 stack slots to words.
    comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize
    comp_words_on_stack = align_up(comp_words_on_stack, 2);
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }


  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack so that the youngest frame
  // always sees it as far as the placement of the call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code.  Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
            "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address)
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because c2 resolve stubs return Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
  // to the interpreter.  The args start out packed in the compiled layout.  They
  // need to be unpacked into the interpreter layout.  This will almost always
  // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ load_klass(temp, receiver, rscratch1);
    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
    __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = NULL;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags  = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  __ flush();
  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                         VMRegPair *regs,
                                         VMRegPair *regs2,
                                         int total_args_passed) {
  assert(regs2 == NULL, "not needed on x86");
// We return the amount of VMRegImpl stack slots we need to reserve for all
// the arguments NOT counting out_preserve_stack_slots.

// NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
      c_rarg0, c_rarg1, c_rarg2, c_rarg3
    };
    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
      c_farg0, c_farg1, c_farg2, c_farg3
    };
#else
    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
      c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
    };
    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
      c_farg0, c_farg1, c_farg2, c_farg3,
      c_farg4, c_farg5, c_farg6, c_farg7
    };
#endif // _WIN64


    uint int_args = 0;
    uint fp_args = 0;
    uint stk_args = 0; // inc by 2 each time

    for (int i = 0; i < total_args_passed; i++) {
      switch (sig_bt[i]) {
      case T_BOOLEAN:
      case T_CHAR:
      case T_BYTE:
      case T_SHORT:
      case T_INT:
        if (int_args < Argument::n_int_register_parameters_c) {
          regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
          fp_args++;
          // Allocate slots for callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set1(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_LONG:
        assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
        // fall through
      case T_OBJECT:
      case T_ARRAY:
      case T_ADDRESS:
      case T_METADATA:
        if (int_args < Argument::n_int_register_parameters_c) {
          regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
          fp_args++;
          stk_args += 2;
#endif
        } else {
          regs[i].set2(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_FLOAT:
        if (fp_args < Argument::n_float_register_parameters_c) {
          regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
          int_args++;
          // Allocate slots for callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set1(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_DOUBLE:
        assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
        if (fp_args < Argument::n_float_register_parameters_c) {
          regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
          int_args++;
          // Allocate slots for callee to stuff register args on the stack.
          stk_args += 2;
#endif
        } else {
          regs[i].set2(VMRegImpl::stack2reg(stk_args));
          stk_args += 2;
        }
        break;
      case T_VOID: // Halves of longs and doubles
        assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
        regs[i].set_bad();
        break;
      default:
        ShouldNotReachHere();
        break;
      }
    }
#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}
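// Vector arguments are passed entirely in xmm/ymm/zmm registers below; no
// stack slots are consumed, which is why the returned stack-slot count is
// always zero.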
int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
    }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
    }
  }
}
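// save_args/restore_args spill and reload the given argument registers around
// a call into the VM: integer registers are pushed/popped, and each XMM
// register gets a 16-byte stack slot; restore_args walks the list in reverse.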
static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
    for ( int i = first_arg ; i < arg_count ; i++ ) {
      if (args[i].first()->is_Register()) {
        __ push(args[i].first()->as_Register());
      } else if (args[i].first()->is_XMMRegister()) {
        __ subptr(rsp, 2*wordSize);
        __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
      }
    }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
    for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
      if (args[i].first()->is_Register()) {
        __ pop(args[i].first()->as_Register());
      } else if (args[i].first()->is_XMMRegister()) {
        __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
        __ addptr(rsp, 2*wordSize);
      }
    }
}

// Unpack an array argument into a pointer to the body and the length
// if the array is non-null, otherwise pass 0 for both.
static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
  Register tmp_reg = rax;
  assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
         "possible collision");
  assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
         "possible collision");

  __ block_comment("unpack_array_argument {");

  // Pass the length, ptr pair
  Label is_null, done;
  VMRegPair tmp;
  tmp.set_ptr(tmp_reg->as_VMReg());
  if (reg.first()->is_stack()) {
    // Load the arg up from the stack
    __ move_ptr(reg, tmp);
    reg = tmp;
  }
  __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
  __ jccb(Assembler::equal, is_null);
  __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move_ptr(tmp, body_arg);
  // load the length relative to the body.
  __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
                           arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move32_64(tmp, length_arg);
  __ jmpb(done);
  __ bind(is_null);
  // Pass zeros
  __ xorptr(tmp_reg, tmp_reg);
  __ move_ptr(tmp, body_arg);
  __ move32_64(tmp, length_arg);
  __ bind(done);

  __ block_comment("} unpack_array_argument");
}


// Different signatures may require very different orders for the move
// to avoid clobbering other arguments.  There's no simple way to
// order them safely.  Compute a safe order for issuing stores and
// break any cycles in those stores.  This code is fairly general but
// it's not necessary on the other platforms so we keep it in the
// platform dependent code instead of moving it into a shared file.
// (See bugs 7013347 & 7145024.)
// Note that this code is specific to LP64.
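// Example of a cycle the code below must break (illustrative): if one argument
// has to move from rdi to rsi while another has to move from rsi to rdi,
// neither store can safely go first; break_cycle() redirects one of them
// through tmp_vmreg and appends a final store from the temp to the real
// destination.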
1286 class ComputeMoveOrder: public StackObj {
1287   class MoveOperation: public ResourceObj {
1288     friend class ComputeMoveOrder;
1289    private:
1290     VMRegPair        _src;
1291     VMRegPair        _dst;
1292     int              _src_index;
1293     int              _dst_index;
1294     bool             _processed;
1295     MoveOperation*  _next;
1296     MoveOperation*  _prev;
1297 
1298     static int get_id(VMRegPair r) {
1299       return r.first()->value();
1300     }
1301 
1302    public:
1303     MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1304       _src(src)
1305     , _dst(dst)
1306     , _src_index(src_index)
1307     , _dst_index(dst_index)
1308     , _processed(false)
1309     , _next(NULL)
1310     , _prev(NULL) {
1311     }
1312 
1313     VMRegPair src() const              { return _src; }
1314     int src_id() const                 { return get_id(src()); }
1315     int src_index() const              { return _src_index; }
1316     VMRegPair dst() const              { return _dst; }
1317     void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
1318     int dst_index() const              { return _dst_index; }
1319     int dst_id() const                 { return get_id(dst()); }
1320     MoveOperation* next() const       { return _next; }
1321     MoveOperation* prev() const       { return _prev; }
1322     void set_processed()               { _processed = true; }
1323     bool is_processed() const          { return _processed; }
1324 
1325     // insert
1326     void break_cycle(VMRegPair temp_register) {
1327       // create a new store following the last store
1328       // to move from the temp_register to the original
1329       MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1330 
1331       // break the cycle of links and insert new_store at the end
1332       // break the reverse link.
1333       MoveOperation* p = prev();
1334       assert(p->next() == this, "must be");
1335       _prev = NULL;
1336       p->_next = new_store;
1337       new_store->_prev = p;
1338 
1339       // change the original store to save its value in the temp.
1340       set_dst(-1, temp_register);
1341     }
1342 
1343     void link(GrowableArray<MoveOperation*>& killer) {
1344       // link this store in front of the store that it depends on
1345       MoveOperation* n = killer.at_grow(src_id(), NULL);
1346       if (n != NULL) {
1347         assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1348         _next = n;
1349         n->_prev = this;
1350       }
1351     }
1352   };
1353 
1354  private:
1355   GrowableArray<MoveOperation*> edges;
1356 
1357  public:
1358   ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1359                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1360     // Move operations where the dest is the stack can all be
1361     // scheduled first since they can't interfere with the other moves.
1362     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1363       if (in_sig_bt[i] == T_ARRAY) {
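             // A T_ARRAY argument expands to a (length, pointer) pair in the C
             // signature, so it takes two outgoing slots; step c_arg back to the
             // first of them.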
1364         c_arg--;
1365         if (out_regs[c_arg].first()->is_stack() &&
1366             out_regs[c_arg + 1].first()->is_stack()) {
1367           arg_order.push(i);
1368           arg_order.push(c_arg);
1369         } else {
1370           if (out_regs[c_arg].first()->is_stack() ||
1371               in_regs[i].first() == out_regs[c_arg].first()) {
1372             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1373           } else {
1374             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1375           }
1376         }
1377       } else if (in_sig_bt[i] == T_VOID) {
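             // A T_VOID entry is the upper half of a long/double; nothing is
             // moved, the pair is only recorded to keep the indices aligned.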
1378         arg_order.push(i);
1379         arg_order.push(c_arg);
1380       } else {
1381         if (out_regs[c_arg].first()->is_stack() ||
1382             in_regs[i].first() == out_regs[c_arg].first()) {
1383           arg_order.push(i);
1384           arg_order.push(c_arg);
1385         } else {
1386           add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1387         }
1388       }
1389     }
1390     // Break any cycles in the register moves and emit them in the
1391     // proper order.
1392     GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1393     for (int i = 0; i < stores->length(); i++) {
1394       arg_order.push(stores->at(i)->src_index());
1395       arg_order.push(stores->at(i)->dst_index());
1396     }
1397  }
1398 
1399   // Collect all the move operations
1400   void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1401     if (src.first() == dst.first()) return;
1402     edges.append(new MoveOperation(src_index, src, dst_index, dst));
1403   }
1404 
1405   // Walk the edges breaking cycles between moves.  The result list
1406   // can be walked in order to produce the proper set of loads
1407   GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1408     // Record which moves kill which values
1409     GrowableArray<MoveOperation*> killer;
1410     for (int i = 0; i < edges.length(); i++) {
1411       MoveOperation* s = edges.at(i);
1412       assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1413       killer.at_put_grow(s->dst_id(), s, NULL);
1414     }
1415     assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1416            "make sure temp isn't in the registers that are killed");
1417 
1418     // create links between loads and stores
1419     for (int i = 0; i < edges.length(); i++) {
1420       edges.at(i)->link(killer);
1421     }
1422 
1423     // at this point, all the move operations are chained together
1424     // in a doubly linked list.  Processing it backwards finds
1425     // the beginning of the chain, forwards finds the end.  If there's
1426     // a cycle, it can be broken at any point, so pick an edge and walk
1427     // backward until the list ends or we end where we started.
1428     GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1429     for (int e = 0; e < edges.length(); e++) {
1430       MoveOperation* s = edges.at(e);
1431       if (!s->is_processed()) {
1432         MoveOperation* start = s;
1433         // search for the beginning of the chain or cycle
1434         while (start->prev() != NULL && start->prev() != s) {
1435           start = start->prev();
1436         }
1437         if (start->prev() == s) {
1438           start->break_cycle(temp_register);
1439         }
1440         // walk the chain forward inserting to store list
1441         while (start != NULL) {
1442           stores->append(start);
1443           start->set_processed();
1444           start = start->next();
1445         }
1446       }
1447     }
1448     return stores;
1449   }
1450 };
1451 
1452 static void verify_oop_args(MacroAssembler* masm,
1453                             const methodHandle& method,
1454                             const BasicType* sig_bt,
1455                             const VMRegPair* regs) {
1456   Register temp_reg = rbx;  // not part of any compiled calling seq
1457   if (VerifyOops) {
1458     for (int i = 0; i < method->size_of_parameters(); i++) {
1459       if (is_reference_type(sig_bt[i])) {
1460         VMReg r = regs[i].first();
1461         assert(r->is_valid(), "bad oop arg");
1462         if (r->is_stack()) {
1463           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1464           __ verify_oop(temp_reg);
1465         } else {
1466           __ verify_oop(r->as_Register());
1467         }
1468       }
1469     }
1470   }
1471 }
1472 
1473 static void gen_special_dispatch(MacroAssembler* masm,
1474                                  const methodHandle& method,
1475                                  const BasicType* sig_bt,
1476                                  const VMRegPair* regs) {
1477   verify_oop_args(masm, method, sig_bt, regs);
1478   vmIntrinsics::ID iid = method->intrinsic_id();
1479 
1480   // Now write the args into the outgoing interpreter space
1481   bool     has_receiver   = false;
1482   Register receiver_reg   = noreg;
1483   int      member_arg_pos = -1;
1484   Register member_reg     = noreg;
1485   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1486   if (ref_kind != 0) {
1487     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1488     member_reg = rbx;  // known to be free at this point
1489     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1490   } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1491     has_receiver = true;
1492   } else {
1493     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1494   }
1495 
1496   if (member_reg != noreg) {
1497     // Load the member_arg into register, if necessary.
1498     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1499     VMReg r = regs[member_arg_pos].first();
1500     if (r->is_stack()) {
1501       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1502     } else {
1503       // no data motion is needed
1504       member_reg = r->as_Register();
1505     }
1506   }
1507 
1508   if (has_receiver) {
1509     // Make sure the receiver is loaded into a register.
1510     assert(method->size_of_parameters() > 0, "oob");
1511     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1512     VMReg r = regs[0].first();
1513     assert(r->is_valid(), "bad receiver arg");
1514     if (r->is_stack()) {
1515       // Porting note:  This assumes that compiled calling conventions always
1516       // pass the receiver oop in a register.  If this is not true on some
1517       // platform, pick a temp and load the receiver from stack.
1518       fatal("receiver always in a register");
1519       receiver_reg = j_rarg0;  // known to be free at this point
1520       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1521     } else {
1522       // no data motion is needed
1523       receiver_reg = r->as_Register();
1524     }
1525   }
1526 
1527   // Figure out which address we are really jumping to:
1528   MethodHandles::generate_method_handle_dispatch(masm, iid,
1529                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1530 }
1531 
1532 // ---------------------------------------------------------------------------
1533 // Generate a native wrapper for a given method.  The method takes arguments
1534 // in the Java compiled code convention, marshals them to the native
1535 // convention (handlizes oops, etc), transitions to native, makes the call,
1536 // returns to java state (possibly blocking), unhandlizes any result and
1537 // returns.
1538 //
1539 // Critical native functions are a shorthand for the use of
1540 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1541 // functions.  The wrapper is expected to unpack the arguments before
1542 // passing them to the callee. Critical native functions leave the state _in_Java,
1543 // since they cannot stop for GC.
1544 // Some other parts of JNI setup are skipped, like the teardown of the JNI handle
1545 // block and the check for pending exceptions, since it's impossible for them
1546 // to be thrown.
1547 //
1548 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1549                                                 const methodHandle& method,
1550                                                 int compile_id,
1551                                                 BasicType* in_sig_bt,
1552                                                 VMRegPair* in_regs,
1553                                                 BasicType ret_type,
1554                                                 address critical_entry) {
1555   if (method->is_method_handle_intrinsic()) {
1556     vmIntrinsics::ID iid = method->intrinsic_id();
1557     intptr_t start = (intptr_t)__ pc();
1558     int vep_offset = ((intptr_t)__ pc()) - start;
1559     gen_special_dispatch(masm,
1560                          method,
1561                          in_sig_bt,
1562                          in_regs);
1563     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1564     __ flush();
1565     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1566     return nmethod::new_native_nmethod(method,
1567                                        compile_id,
1568                                        masm->code(),
1569                                        vep_offset,
1570                                        frame_complete,
1571                                        stack_slots / VMRegImpl::slots_per_word,
1572                                        in_ByteSize(-1),
1573                                        in_ByteSize(-1),
1574                                        (OopMapSet*)NULL);
1575   }
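
       // A non-NULL critical_entry marks this as a JNI critical native: we call
       // that entry directly and skip most of the usual JNIEnv and handle setup.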
1576   bool is_critical_native = true;
1577   address native_func = critical_entry;
1578   if (native_func == NULL) {
1579     native_func = method->native_function();
1580     is_critical_native = false;
1581   }
1582   assert(native_func != NULL, "must have function");
1583 
1584   // An OopMap for lock (and class if static)
1585   OopMapSet *oop_maps = new OopMapSet();
1586   intptr_t start = (intptr_t)__ pc();
1587 
1588   // We have received a description of where all the Java args are located
1589   // on entry to the wrapper. We need to convert these args to where
1590   // the jni function will expect them. To figure out where they go
1591   // we convert the java signature to a C signature by inserting
1592   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1593 
1594   const int total_in_args = method->size_of_parameters();
1595   int total_c_args = total_in_args;
1596   if (!is_critical_native) {
1597     total_c_args += 1;
1598     if (method->is_static()) {
1599       total_c_args++;
1600     }
1601   } else {
1602     for (int i = 0; i < total_in_args; i++) {
1603       if (in_sig_bt[i] == T_ARRAY) {
1604         total_c_args++;
1605       }
1606     }
1607   }
1608 
1609   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1610   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1611   BasicType* in_elem_bt = NULL;
1612 
1613   int argc = 0;
1614   if (!is_critical_native) {
1615     out_sig_bt[argc++] = T_ADDRESS;
1616     if (method->is_static()) {
1617       out_sig_bt[argc++] = T_OBJECT;
1618     }
1619 
1620     for (int i = 0; i < total_in_args ; i++ ) {
1621       out_sig_bt[argc++] = in_sig_bt[i];
1622     }
1623   } else {
1624     in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1625     SignatureStream ss(method->signature());
1626     for (int i = 0; i < total_in_args ; i++ ) {
1627       if (in_sig_bt[i] == T_ARRAY) {
1628         // Arrays are passed as int, elem* pair
1629         out_sig_bt[argc++] = T_INT;
1630         out_sig_bt[argc++] = T_ADDRESS;
1631         ss.skip_array_prefix(1);  // skip one '['
1632         assert(ss.is_primitive(), "primitive type expected");
1633         in_elem_bt[i] = ss.type();
1634       } else {
1635         out_sig_bt[argc++] = in_sig_bt[i];
1636         in_elem_bt[i] = T_VOID;
1637       }
1638       if (in_sig_bt[i] != T_VOID) {
1639         assert(in_sig_bt[i] == ss.type() ||
1640                in_sig_bt[i] == T_ARRAY, "must match");
1641         ss.next();
1642       }
1643     }
1644   }
1645 
1646   // Now figure out where the args must be stored and how much stack space
1647   // they require.
1648   int out_arg_slots;
1649   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1650 
1651   // Compute framesize for the wrapper.  We need to handlize all oops in
1652   // incoming registers
1653 
1654   // Calculate the total number of stack slots we will need.
1655 
1656   // First count the abi requirement plus all of the outgoing args
1657   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1658 
1659   // Now the space for the inbound oop handle area
1660   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1661   if (is_critical_native) {
1662     // Critical natives may have to call out so they need a save area
1663     // for register arguments.
1664     int double_slots = 0;
1665     int single_slots = 0;
1666     for ( int i = 0; i < total_in_args; i++) {
1667       if (in_regs[i].first()->is_Register()) {
1668         const Register reg = in_regs[i].first()->as_Register();
1669         switch (in_sig_bt[i]) {
1670           case T_BOOLEAN:
1671           case T_BYTE:
1672           case T_SHORT:
1673           case T_CHAR:
1674           case T_INT:  single_slots++; break;
1675           case T_ARRAY:  // specific to LP64 (7145024)
1676           case T_LONG: double_slots++; break;
1677           default:  ShouldNotReachHere();
1678         }
1679       } else if (in_regs[i].first()->is_XMMRegister()) {
1680         switch (in_sig_bt[i]) {
1681           case T_FLOAT:  single_slots++; break;
1682           case T_DOUBLE: double_slots++; break;
1683           default:  ShouldNotReachHere();
1684         }
1685       } else if (in_regs[i].first()->is_FloatRegister()) {
1686         ShouldNotReachHere();
1687       }
1688     }
1689     total_save_slots = double_slots * 2 + single_slots;
1690     // align the save area
1691     if (double_slots != 0) {
1692       stack_slots = align_up(stack_slots, 2);
1693     }
1694   }
1695 
1696   int oop_handle_offset = stack_slots;
1697   stack_slots += total_save_slots;
1698 
1699   // Now any space we need for handlizing a klass if static method
1700 
1701   int klass_slot_offset = 0;
1702   int klass_offset = -1;
1703   int lock_slot_offset = 0;
1704   bool is_static = false;
1705 
1706   if (method->is_static()) {
1707     klass_slot_offset = stack_slots;
1708     stack_slots += VMRegImpl::slots_per_word;
1709     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1710     is_static = true;
1711   }
1712 
1713   // Plus a lock if needed
1714 
1715   if (method->is_synchronized()) {
1716     lock_slot_offset = stack_slots;
1717     stack_slots += VMRegImpl::slots_per_word;
1718   }
1719 
1720   // Now a place (+2) to save return values or temp during shuffling
1721   // + 4 for return address (which we own) and saved rbp
1722   stack_slots += 6;
1723 
1724   // OK, the space we have allocated will look like:
1725   //
1726   //
1727   // FP-> |                     |
1728   //      |---------------------|
1729   //      | 2 slots for moves   |
1730   //      |---------------------|
1731   //      | lock box (if sync)  |
1732   //      |---------------------| <- lock_slot_offset
1733   //      | klass (if static)   |
1734   //      |---------------------| <- klass_slot_offset
1735   //      | oopHandle area      |
1736   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1737   //      | outbound memory     |
1738   //      | based arguments     |
1739   //      |                     |
1740   //      |---------------------|
1741   //      |                     |
1742   // SP-> | out_preserved_slots |
1743   //
1744   //
1745 
1746 
1747   // Now compute the actual number of stack words we need, rounding to keep
1748   // the stack properly aligned.
1749   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1750 
1751   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1752 
1753   // First thing make an ic check to see if we should even be here
1754 
1755   // We are free to use all registers as temps without saving them and
1756   // restoring them except rbp. rbp is the only callee save register
1757   // as far as the interpreter and the compiler(s) are concerned.
1758 
1759 
1760   const Register ic_reg = rax;
1761   const Register receiver = j_rarg0;
1762 
1763   Label hit;
1764   Label exception_pending;
1765 
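       // The caller passes the expected klass in rax (ic_reg); compare it with the
       // receiver's klass and jump to the inline-cache miss stub on a mismatch.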
1766   assert_different_registers(ic_reg, receiver, rscratch1);
1767   __ verify_oop(receiver);
1768   __ load_klass(rscratch1, receiver, rscratch2);
1769   __ cmpq(ic_reg, rscratch1);
1770   __ jcc(Assembler::equal, hit);
1771 
1772   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1773 
1774   // Verified entry point must be aligned
1775   __ align(8);
1776 
1777   __ bind(hit);
1778 
1779   int vep_offset = ((intptr_t)__ pc()) - start;
1780 
1781   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1782     Label L_skip_barrier;
1783     Register klass = r10;
1784     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1785     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1786 
1787     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1788 
1789     __ bind(L_skip_barrier);
1790   }
1791 
1792 #ifdef COMPILER1
1793   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1794   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1795     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1796   }
1797 #endif // COMPILER1
1798 
1799   // The instruction at the verified entry point must be 5 bytes or longer
1800   // because it can be patched on the fly by make_non_entrant. The stack bang
1801   // instruction fits that requirement.
1802 
1803   // Generate stack overflow check
1804   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1805 
1806   // Generate a new frame for the wrapper.
1807   __ enter();
1808   // -2 because return address is already present and so is saved rbp
1809   __ subptr(rsp, stack_size - 2*wordSize);
1810 
1811   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1812   bs->nmethod_entry_barrier(masm);
1813 
1814   // Frame is now completed as far as size and linkage.
1815   int frame_complete = ((intptr_t)__ pc()) - start;
1816 
1817     if (UseRTMLocking) {
1818       // Abort RTM transaction before calling JNI
1819       // because critical section will be large and will be
1820       // aborted anyway. Also nmethod could be deoptimized.
1821       __ xabort(0);
1822     }
1823 
1824 #ifdef ASSERT
1825     {
1826       Label L;
1827       __ mov(rax, rsp);
1828       __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1829       __ cmpptr(rax, rsp);
1830       __ jcc(Assembler::equal, L);
1831       __ stop("improperly aligned stack");
1832       __ bind(L);
1833     }
1834 #endif /* ASSERT */
1835 
1836 
1837   // We use r14 as the oop handle for the receiver/klass
1838   // It is callee save so it survives the call to native
1839 
1840   const Register oop_handle_reg = r14;
1841 
1842   //
1843   // We immediately shuffle the arguments so that for any vm call we have to
1844   // make from here on out (sync slow path, jvmti, etc.) we will have
1845   // captured the oops from our caller and have a valid oopMap for
1846   // them.
1847 
1848   // -----------------
1849   // The Grand Shuffle
1850 
1851   // The Java calling convention is either equal (linux) or denser (win64) than the
1852   // C calling convention. However, because of the JNIEnv argument, the C calling
1853   // convention always has at least one more argument (and two for static methods) than Java.
1854   // Therefore if we move the args from Java -> C backwards then we will never have
1855   // a register->register conflict and we don't have to build a dependency graph
1856   // and figure out how to break any cycles.
1857   //
1858 
1859   // Record esp-based slot for receiver on stack for non-static methods
1860   int receiver_offset = -1;
1861 
1862   // This is a trick. We double the stack slots so we can claim
1863   // the oops in the caller's frame. Since we are sure to have
1864   // more args than the caller, doubling is enough to make
1865   // sure we can capture all the incoming oop args from the
1866   // caller.
1867   //
1868   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1869 
1870   // Mark location of rbp (someday)
1871   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1872 
1873   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1874   // All inbound args are referenced based on rbp and all outbound args via rsp.
1875 
1876 
1877 #ifdef ASSERT
1878   bool reg_destroyed[RegisterImpl::number_of_registers];
1879   bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1880   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1881     reg_destroyed[r] = false;
1882   }
1883   for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1884     freg_destroyed[f] = false;
1885   }
1886 
1887 #endif /* ASSERT */
1888 
1889   // This may iterate in two different directions depending on the
1890   // kind of native it is.  The reason is that for regular JNI natives
1891   // the incoming and outgoing registers are offset upwards and for
1892   // critical natives they are offset down.
1893   GrowableArray<int> arg_order(2 * total_in_args);
1894 
1895   VMRegPair tmp_vmreg;
1896   tmp_vmreg.set2(rbx->as_VMReg());
1897 
1898   if (!is_critical_native) {
1899     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1900       arg_order.push(i);
1901       arg_order.push(c_arg);
1902     }
1903   } else {
1904     // Compute a valid move order, using tmp_vmreg to break any cycles
1905     ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
1906   }
1907 
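       // temploc remembers which Java argument is currently parked in tmp_vmreg
       // while a move cycle is being broken (critical natives only); -1 means none.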
1908   int temploc = -1;
1909   for (int ai = 0; ai < arg_order.length(); ai += 2) {
1910     int i = arg_order.at(ai);
1911     int c_arg = arg_order.at(ai + 1);
1912     __ block_comment(err_msg("move %d -> %d", i, c_arg));
1913     if (c_arg == -1) {
1914       assert(is_critical_native, "should only be required for critical natives");
1915       // This arg needs to be moved to a temporary
1916       __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
1917       in_regs[i] = tmp_vmreg;
1918       temploc = i;
1919       continue;
1920     } else if (i == -1) {
1921       assert(is_critical_native, "should only be required for critical natives");
1922       // Read from the temporary location
1923       assert(temploc != -1, "must be valid");
1924       i = temploc;
1925       temploc = -1;
1926     }
1927 #ifdef ASSERT
1928     if (in_regs[i].first()->is_Register()) {
1929       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1930     } else if (in_regs[i].first()->is_XMMRegister()) {
1931       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1932     }
1933     if (out_regs[c_arg].first()->is_Register()) {
1934       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1935     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1936       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1937     }
1938 #endif /* ASSERT */
1939     switch (in_sig_bt[i]) {
1940       case T_ARRAY:
1941         if (is_critical_native) {
1942           unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
1943           c_arg++;
1944 #ifdef ASSERT
1945           if (out_regs[c_arg].first()->is_Register()) {
1946             reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1947           } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1948             freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1949           }
1950 #endif
1951           break;
1952         }
1953       case T_OBJECT:
1954         assert(!is_critical_native, "no oop arguments");
1955         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1956                     ((i == 0) && (!is_static)),
1957                     &receiver_offset);
1958         break;
1959       case T_VOID:
1960         break;
1961 
1962       case T_FLOAT:
1963         __ float_move(in_regs[i], out_regs[c_arg]);
1964         break;
1965 
1966       case T_DOUBLE:
1967         assert( i + 1 < total_in_args &&
1968                 in_sig_bt[i + 1] == T_VOID &&
1969                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1970         __ double_move(in_regs[i], out_regs[c_arg]);
1971         break;
1972 
1973       case T_LONG :
1974         __ long_move(in_regs[i], out_regs[c_arg]);
1975         break;
1976 
1977       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1978 
1979       default:
1980         __ move32_64(in_regs[i], out_regs[c_arg]);
1981     }
1982   }
1983 
1984   int c_arg;
1985 
1986   // Pre-load a static method's oop into r14.  Used both by locking code and
1987   // the normal JNI call code.
1988   if (!is_critical_native) {
1989     // point c_arg at the first arg that is already loaded in case we
1990     // need to spill before we call out
1991     c_arg = total_c_args - total_in_args;
1992 
1993     if (method->is_static()) {
1994 
1995       //  load oop into a register
1996       __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1997 
1998       // Now handlize the static class mirror; it's known to be non-null.
1999       __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2000       map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2001 
2002       // Now get the handle
2003       __ lea(oop_handle_reg, Address(rsp, klass_offset));
2004       // store the klass handle as second argument
2005       __ movptr(c_rarg1, oop_handle_reg);
2006       // and protect the arg if we must spill
2007       c_arg--;
2008     }
2009   } else {
2010     // For JNI critical methods we need to save all registers in save_args.
2011     c_arg = 0;
2012   }
2013 
2014   // Change state to native (we save the return address in the thread, since it might not
2015   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2016   // points into the right code segment. It does not have to be the correct return pc.
2017   // We use the same pc/oopMap repeatedly when we call out
2018 
2019   intptr_t the_pc = (intptr_t) __ pc();
2020   oop_maps->add_gc_map(the_pc - start, map);
2021 
2022   __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2023 
2024 
2025   // We have all of the arguments set up at this point. We must not touch any of the
2026   // argument registers here (if we had to save/restore them, no oop map would cover them).
2027 
2028   {
2029     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2030     // protect the args we've loaded
2031     save_args(masm, total_c_args, c_arg, out_regs);
2032     __ mov_metadata(c_rarg1, method());
2033     __ call_VM_leaf(
2034       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2035       r15_thread, c_rarg1);
2036     restore_args(masm, total_c_args, c_arg, out_regs);
2037   }
2038 
2039   // RedefineClasses() tracing support for obsolete method entry
2040   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2041     // protect the args we've loaded
2042     save_args(masm, total_c_args, c_arg, out_regs);
2043     __ mov_metadata(c_rarg1, method());
2044     __ call_VM_leaf(
2045       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2046       r15_thread, c_rarg1);
2047     restore_args(masm, total_c_args, c_arg, out_regs);
2048   }
2049 
2050   // Lock a synchronized method
2051 
2052   // Register definitions used by locking and unlocking
2053 
2054   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2055   const Register obj_reg  = rbx;  // Will contain the oop
2056   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2057   const Register old_hdr  = r13;  // value of old header at unlock time
2058 
2059   Label slow_path_lock;
2060   Label lock_done;
2061 
2062   if (method->is_synchronized()) {
2063     assert(!is_critical_native, "unhandled");
2064 
2065 
2066     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2067 
2068     // Get the handle (the 2nd argument)
2069     __ mov(oop_handle_reg, c_rarg1);
2070 
2071     // Get address of the box
2072 
2073     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2074 
2075     // Load the oop from the handle
2076     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2077 
2078     // Load immediate 1 into swap_reg %rax
2079     __ movl(swap_reg, 1);
2080 
2081     // Load (object->mark() | 1) into swap_reg %rax
2082     __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2083 
2084     // Save (object->mark() | 1) into BasicLock's displaced header
2085     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2086 
2087     // src -> dest iff dest == rax else rax <- dest
2088     __ lock();
2089     __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2090     __ jcc(Assembler::equal, lock_done);
2091 
2092     // Hmm should this move to the slow path code area???
2093 
2094     // Test if the oopMark is an obvious stack pointer, i.e.,
2095     //  1) (mark & 3) == 0, and
2096     //  2) rsp <= mark < mark + os::pagesize()
2097     // These 3 tests can be done by evaluating the following
2098     // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2099     // assuming both stack pointer and pagesize have their
2100     // least significant 2 bits clear.
2101     // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2102 
2103     __ subptr(swap_reg, rsp);
2104     __ andptr(swap_reg, 3 - os::vm_page_size());
2105 
2106     // Save the test result; for the recursive case, the result is zero
2107     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2108     __ jcc(Assembler::notEqual, slow_path_lock);
2109 
2110     // Slow path will re-enter here
2111 
2112     __ bind(lock_done);
2113   }
2114 
2115   // Finally just about ready to make the JNI call
2116 
2117   // get JNIEnv* which is first argument to native
2118   if (!is_critical_native) {
2119     __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2120 
2121     // Now set thread in native
2122     __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2123   }
2124 
2125   __ call(RuntimeAddress(native_func));
2126 
2127   // Verify or restore cpu control state after JNI call
2128   __ restore_cpu_control_state_after_jni();
2129 
2130   // Unpack native results.
2131   switch (ret_type) {
2132   case T_BOOLEAN: __ c2bool(rax);            break;
2133   case T_CHAR   : __ movzwl(rax, rax);      break;
2134   case T_BYTE   : __ sign_extend_byte (rax); break;
2135   case T_SHORT  : __ sign_extend_short(rax); break;
2136   case T_INT    : /* nothing to do */        break;
2137   case T_DOUBLE :
2138   case T_FLOAT  :
2139     // Result is in xmm0; we'll save it as needed
2140     break;
2141   case T_ARRAY:                 // Really a handle
2142   case T_OBJECT:                // Really a handle
2143       break; // can't de-handlize until after safepoint check
2144   case T_VOID: break;
2145   case T_LONG: break;
2146   default       : ShouldNotReachHere();
2147   }
2148 
2149   Label after_transition;
2150 
2151   // If this is a critical native, check for a safepoint or suspend request after the call.
2152   // If a safepoint is needed, transition to native, then to native_trans to handle
2153   // safepoints like the native methods that are not critical natives.
2154   if (is_critical_native) {
2155     Label needs_safepoint;
2156     __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */);
2157     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2158     __ jcc(Assembler::equal, after_transition);
2159     __ bind(needs_safepoint);
2160   }
2161 
2162   // Switch thread to "native transition" state before reading the synchronization state.
2163   // This additional state is necessary because reading and testing the synchronization
2164   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2165   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2166   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2167   //     Thread A is resumed to finish this native method, but doesn't block here since it
2168   //     didn't see any synchronization in progress, and escapes.
2169   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2170 
2171   // Force this write out before the read below
2172   __ membar(Assembler::Membar_mask_bits(
2173               Assembler::LoadLoad | Assembler::LoadStore |
2174               Assembler::StoreLoad | Assembler::StoreStore));
2175 
2176   // check for safepoint operation in progress and/or pending suspend requests
2177   {
2178     Label Continue;
2179     Label slow_path;
2180 
2181     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2182 
2183     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2184     __ jcc(Assembler::equal, Continue);
2185     __ bind(slow_path);
2186 
2187     // Don't use call_VM as it will see a possible pending exception and forward it
2188     // and never return here, preventing us from clearing _last_native_pc down below.
2189     // We also can't use call_VM_leaf, as it will check to see if rsi & rdi are
2190     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2191     // by hand.
2192     //
2193     __ vzeroupper();
2194     save_native_result(masm, ret_type, stack_slots);
2195     __ mov(c_rarg0, r15_thread);
2196     __ mov(r12, rsp); // remember sp
2197     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2198     __ andptr(rsp, -16); // align stack as required by ABI
2199     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2200     __ mov(rsp, r12); // restore sp
2201     __ reinit_heapbase();
2202     // Restore any method result value
2203     restore_native_result(masm, ret_type, stack_slots);
2204     __ bind(Continue);
2205   }
2206 
2207   // change thread state
2208   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2209   __ bind(after_transition);
2210 
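       // If the stack guard (yellow) pages were disabled while in native, re-enable
       // them via the out-of-line reguard path before returning to Java.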
2211   Label reguard;
2212   Label reguard_done;
2213   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2214   __ jcc(Assembler::equal, reguard);
2215   __ bind(reguard_done);
2216 
2217   // native result if any is live
2218 
2219   // Unlock
2220   Label unlock_done;
2221   Label slow_path_unlock;
2222   if (method->is_synchronized()) {
2223 
2224     // Get locked oop from the handle we passed to jni
2225     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2226 
2227     Label done;
2228     // Simple recursive lock?
2229 
2230     __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2231     __ jcc(Assembler::equal, done);
2232 
2233     // Must save rax if it is live now because cmpxchg must use it
2234     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2235       save_native_result(masm, ret_type, stack_slots);
2236     }
2237 
2238 
2239     // get address of the stack lock
2240     __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2241     //  get old displaced header
2242     __ movptr(old_hdr, Address(rax, 0));
2243 
2244     // Atomic swap old header if oop still contains the stack lock
2245     __ lock();
2246     __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2247     __ jcc(Assembler::notEqual, slow_path_unlock);
2248 
2249     // slow path re-enters here
2250     __ bind(unlock_done);
2251     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2252       restore_native_result(masm, ret_type, stack_slots);
2253     }
2254 
2255     __ bind(done);
2256 
2257   }
2258   {
2259     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2260     save_native_result(masm, ret_type, stack_slots);
2261     __ mov_metadata(c_rarg1, method());
2262     __ call_VM_leaf(
2263          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2264          r15_thread, c_rarg1);
2265     restore_native_result(masm, ret_type, stack_slots);
2266   }
2267 
2268   __ reset_last_Java_frame(false);
2269 
2270   // Unbox oop result, e.g. JNIHandles::resolve value.
2271   if (is_reference_type(ret_type)) {
2272     __ resolve_jobject(rax /* value */,
2273                        r15_thread /* thread */,
2274                        rcx /* tmp */);
2275   }
2276 
2277   if (CheckJNICalls) {
2278     // clear_pending_jni_exception_check
2279     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2280   }
2281 
2282   if (!is_critical_native) {
2283     // reset handle block
2284     __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2285     __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2286   }
2287 
2288   // pop our frame
2289 
2290   __ leave();
2291 
2292   if (!is_critical_native) {
2293     // Any exception pending?
2294     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2295     __ jcc(Assembler::notEqual, exception_pending);
2296   }
2297 
2298   // Return
2299 
2300   __ ret(0);
2301 
2302   // Unexpected paths are out of line and go here
2303 
2304   if (!is_critical_native) {
2305     // forward the exception
2306     __ bind(exception_pending);
2307 
2308     // and forward the exception
2309     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2310   }
2311 
2312   // Slow path locking & unlocking
2313   if (method->is_synchronized()) {
2314 
2315     // BEGIN Slow path lock
2316     __ bind(slow_path_lock);
2317 
2318     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2319     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2320 
2321     // protect the args we've loaded
2322     save_args(masm, total_c_args, c_arg, out_regs);
2323 
2324     __ mov(c_rarg0, obj_reg);
2325     __ mov(c_rarg1, lock_reg);
2326     __ mov(c_rarg2, r15_thread);
2327 
2328     // Not a leaf but we have last_Java_frame setup as we want
2329     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2330     restore_args(masm, total_c_args, c_arg, out_regs);
2331 
2332 #ifdef ASSERT
2333     { Label L;
2334     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2335     __ jcc(Assembler::equal, L);
2336     __ stop("no pending exception allowed on exit from monitorenter");
2337     __ bind(L);
2338     }
2339 #endif
2340     __ jmp(lock_done);
2341 
2342     // END Slow path lock
2343 
2344     // BEGIN Slow path unlock
2345     __ bind(slow_path_unlock);
2346 
2347     // If we haven't already saved the native result we must save it now as xmm registers
2348     // are still exposed.
2349     __ vzeroupper();
2350     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2351       save_native_result(masm, ret_type, stack_slots);
2352     }
2353 
2354     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2355 
2356     __ mov(c_rarg0, obj_reg);
2357     __ mov(c_rarg2, r15_thread);
2358     __ mov(r12, rsp); // remember sp
2359     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2360     __ andptr(rsp, -16); // align stack as required by ABI
2361 
2362     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2363     // NOTE that obj_reg == rbx currently
2364     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2365     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2366 
2367     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2368     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2369     __ mov(rsp, r12); // restore sp
2370     __ reinit_heapbase();
2371 #ifdef ASSERT
2372     {
2373       Label L;
2374       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2375       __ jcc(Assembler::equal, L);
2376       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2377       __ bind(L);
2378     }
2379 #endif /* ASSERT */
2380 
2381     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2382 
2383     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2384       restore_native_result(masm, ret_type, stack_slots);
2385     }
2386     __ jmp(unlock_done);
2387 
2388     // END Slow path unlock
2389 
2390   } // synchronized
2391 
2392   // SLOW PATH Reguard the stack if needed
2393 
2394   __ bind(reguard);
2395   __ vzeroupper();
2396   save_native_result(masm, ret_type, stack_slots);
2397   __ mov(r12, rsp); // remember sp
2398   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2399   __ andptr(rsp, -16); // align stack as required by ABI
2400   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2401   __ mov(rsp, r12); // restore sp
2402   __ reinit_heapbase();
2403   restore_native_result(masm, ret_type, stack_slots);
2404   // and continue
2405   __ jmp(reguard_done);
2406 
2407 
2408 
2409   __ flush();
2410 
2411   nmethod *nm = nmethod::new_native_nmethod(method,
2412                                             compile_id,
2413                                             masm->code(),
2414                                             vep_offset,
2415                                             frame_complete,
2416                                             stack_slots / VMRegImpl::slots_per_word,
2417                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2418                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2419                                             oop_maps);
2420 
2421   return nm;
2422 }
2423 
2424 // This function returns the adjustment size (in number of words) to a c2i adapter
2425 // activation for use during deoptimization
2426 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2427   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2428 }
2429 
2430 
2431 uint SharedRuntime::out_preserve_stack_slots() {
2432   return 0;
2433 }
2434 
2435 
2436 // Number of stack slots between incoming argument block and the start of
2437 // a new frame.  The PROLOG must add this many slots to the stack.  The
2438 // EPILOG must remove this many slots.  amd64 needs two slots for
2439 // return address.
2440 uint SharedRuntime::in_preserve_stack_slots() {
2441   return 4 + 2 * VerifyStackAtCalls;
2442 }
2443 
2444 //------------------------------generate_deopt_blob----------------------------
2445 void SharedRuntime::generate_deopt_blob() {
2446   // Allocate space for the code
2447   ResourceMark rm;
2448   // Setup code generation tools
2449   int pad = 0;
2450   if (UseAVX > 2) {
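         // Extra code-buffer room: the register save/restore sequences are larger
         // when AVX-512 registers are preserved.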
2451     pad += 1024;
2452   }
2453 #if INCLUDE_JVMCI
2454   if (EnableJVMCI) {
2455     pad += 512; // Increase the buffer size when compiling for JVMCI
2456   }
2457 #endif
2458   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2459   MacroAssembler* masm = new MacroAssembler(&buffer);
2460   int frame_size_in_words;
2461   OopMap* map = NULL;
2462   OopMapSet *oop_maps = new OopMapSet();
2463 
2464   // -------------
2465   // This code enters when returning to a de-optimized nmethod.  A return
2466   // address has been pushed on the stack, and return values are in
2467   // registers.
2468   // If we are doing a normal deopt then we were called from the patched
2469   // nmethod from the point we returned to the nmethod. So the return
2470   // address on the stack is wrong by NativeCall::instruction_size
2471   // We will adjust the value so it looks like we have the original return
2472   // address on the stack (like when we eagerly deoptimized).
2473   // In the case of an exception pending when deoptimizing, we enter
2474   // with a return address on the stack that points after the call we patched
2475   // into the exception handler. We have the following register state from,
2476   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2477   //    rax: exception oop
2478   //    rbx: exception handler
2479   //    rdx: throwing pc
2480   // So in this case we simply jam rdx into the useless return address and
2481   // the stack looks just like we want.
2482   //
2483   // At this point we need to de-opt.  We save the argument return
2484   // registers.  We call the first C routine, fetch_unroll_info().  This
2485   // routine captures the return values and returns a structure which
2486   // describes the current frame size and the sizes of all replacement frames.
2487   // The current frame is compiled code and may contain many inlined
2488   // functions, each with their own JVM state.  We pop the current frame, then
2489   // push all the new frames.  Then we call the C routine unpack_frames() to
2490   // populate these frames.  Finally unpack_frames() returns us the new target
2491   // address.  Notice that callee-save registers are BLOWN here; they have
2492   // already been captured in the vframeArray at the time the return PC was
2493   // patched.
2494   address start = __ pc();
2495   Label cont;
2496 
2497   // Prolog for non exception case!
2498 
2499   // Save everything in sight.
2500   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2501 
2502   // Normal deoptimization.  Save exec mode for unpack_frames.
2503   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2504   __ jmp(cont);
2505 
2506   int reexecute_offset = __ pc() - start;
2507 #if INCLUDE_JVMCI && !defined(COMPILER1)
2508   if (EnableJVMCI && UseJVMCICompiler) {
2509     // JVMCI does not use this kind of deoptimization
2510     __ should_not_reach_here();
2511   }
2512 #endif
2513 
2514   // Reexecute case
2515   // The return address is the pc that describes which bci to re-execute at
2516 
2517   // No need to update map as each call to save_live_registers will produce identical oopmap
2518   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2519 
2520   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2521   __ jmp(cont);
2522 
2523 #if INCLUDE_JVMCI
2524   Label after_fetch_unroll_info_call;
2525   int implicit_exception_uncommon_trap_offset = 0;
2526   int uncommon_trap_offset = 0;
2527 
2528   if (EnableJVMCI) {
2529     implicit_exception_uncommon_trap_offset = __ pc() - start;
2530 
2531     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2532     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2533 
2534     uncommon_trap_offset = __ pc() - start;
2535 
2536     // Save everything in sight.
2537     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2538     // fetch_unroll_info needs to call last_java_frame()
2539     __ set_last_Java_frame(noreg, noreg, NULL);
2540 
2541     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2542     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2543 
2544     __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2545     __ mov(c_rarg0, r15_thread);
2546     __ movl(c_rarg2, r14); // exec mode
2547     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2548     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2549 
2550     __ reset_last_Java_frame(false);
2551 
2552     __ jmp(after_fetch_unroll_info_call);
2553   } // EnableJVMCI
2554 #endif // INCLUDE_JVMCI
2555 
2556   int exception_offset = __ pc() - start;
2557 
2558   // Prolog for exception case
2559 
2560   // all registers are dead at this entry point, except for rax and
2561   // rdx, which contain the exception oop and exception pc
2562   // respectively.  Set them in TLS and fall thru to the
2563   // unpack_with_exception_in_tls entry point.
2564 
2565   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2566   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2567 
2568   int exception_in_tls_offset = __ pc() - start;
2569 
2570   // new implementation because exception oop is now passed in JavaThread
2571 
2572   // Prolog for exception case
2573   // All registers must be preserved because they might be used by LinearScan
2574   // Exception oop and throwing PC are passed in JavaThread
2575   // tos: stack at point of call to method that threw the exception (i.e. only
2576   // args are on the stack, no return address)
2577 
2578   // Make room on the stack for the return address.
2579   // It will be patched later with the throwing pc. The correct value is not
2580   // available now because loading it from memory would destroy registers.
2581   __ push(0);
2582 
2583   // Save everything in sight.
2584   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2585 
2586   // Now it is safe to overwrite any register
2587 
2588   // Deopt during an exception.  Save exec mode for unpack_frames.
2589   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2590 
2591   // load throwing pc from JavaThread and patch it as the return address
2592   // of the current frame. Then clear the field in JavaThread
2593 
2594   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2595   __ movptr(Address(rbp, wordSize), rdx);
2596   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2597 
2598 #ifdef ASSERT
2599   // verify that there is really an exception oop in JavaThread
2600   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2601   __ verify_oop(rax);
2602 
2603   // verify that there is no pending exception
2604   Label no_pending_exception;
2605   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2606   __ testptr(rax, rax);
2607   __ jcc(Assembler::zero, no_pending_exception);
2608   __ stop("must not have pending exception here");
2609   __ bind(no_pending_exception);
2610 #endif
2611 
2612   __ bind(cont);
2613 
2614   // Call C code.  Need thread and this frame, but NOT official VM entry
2615   // crud.  We cannot block on this call, no GC can happen.
2616   //
2617   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2618 
2619   // fetch_unroll_info needs to call last_java_frame().
2620 
2621   __ set_last_Java_frame(noreg, noreg, NULL);
2622 #ifdef ASSERT
2623   { Label L;
2624     __ cmpptr(Address(r15_thread,
2625                     JavaThread::last_Java_fp_offset()),
2626             (int32_t)0);
2627     __ jcc(Assembler::equal, L);
2628     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2629     __ bind(L);
2630   }
2631 #endif // ASSERT
2632   __ mov(c_rarg0, r15_thread);
2633   __ movl(c_rarg1, r14); // exec_mode
2634   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2635 
2636   // Need to have an oopmap that tells fetch_unroll_info where to
2637   // find any register it might need.
2638   oop_maps->add_gc_map(__ pc() - start, map);
2639 
2640   __ reset_last_Java_frame(false);
2641 
2642 #if INCLUDE_JVMCI
2643   if (EnableJVMCI) {
2644     __ bind(after_fetch_unroll_info_call);
2645   }
2646 #endif
2647 
2648   // Load UnrollBlock* into rdi
2649   __ mov(rdi, rax);
2650 
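       // Reload the exec mode (unpack kind) from the UnrollBlock returned by the VM call.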
2651   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2652   Label noException;
2653   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2654   __ jcc(Assembler::notEqual, noException);
2655   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2656   // QQQ this is useless; it was NULL above
2657   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2658   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2659   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2660 
2661   __ verify_oop(rax);
2662 
2663   // Overwrite the result registers with the exception results.
2664   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2665   // This store is likely unnecessary: rdx holds the exception pc that was already cleared above.
2666   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2667 
2668   __ bind(noException);
2669 
2670   // Only register save data is on the stack.
2671   // Now restore the result registers.  Everything else is either dead
2672   // or captured in the vframeArray.
2673   RegisterSaver::restore_result_registers(masm);
2674 
2675   // All of the register save area has been popped off the stack. Only the
2676   // return address remains.
2677 
2678   // Pop all the frames we must move/replace.
2679   //
2680   // Frame picture (youngest to oldest)
2681   // 1: self-frame (no frame link)
2682   // 2: deopting frame  (no frame link)
2683   // 3: caller of deopting frame (could be compiled/interpreted).
2684   //
2685   // Note: by leaving the return address of self-frame on the stack
2686   // and using the size of frame 2 to adjust the stack
2687   // when we are done the return to frame 3 will still be on the stack.
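       // In other words: rsp currently sits one word below the deoptee's sp (at the slot
       // holding the patched deopt pc), and size_of_deoptimized_frame includes the deoptee's
       // own return-address slot, so rsp + size_of_deoptimized_frame lands exactly on the
       // return address into frame 3.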
2688 
2689   // Pop deoptimized frame
2690   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2691   __ addptr(rsp, rcx);
2692 
2693   // rsp should be pointing at the return address to the caller (3)
2694 
2695   // Pick up the initial fp we should save
2696   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2697   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2698 
2699 #ifdef ASSERT
2700   // Compilers generate code that bangs the stack by as much as the
2701   // interpreter would need. So this stack banging should never
2702   // trigger a fault. Verify that it does not on non-product builds.
2703   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2704   __ bang_stack_size(rbx, rcx);
2705 #endif
2706 
2707   // Load address of array of frame pcs into rcx
2708   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2709 
2710   // Trash the old pc
2711   __ addptr(rsp, wordSize);
2712 
2713   // Load address of array of frame sizes into rsi
2714   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2715 
2716   // Load counter into rdx
2717   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2718 
2719   // Now adjust the caller's stack to make up for the extra locals
2720   // but record the original sp so that we can save it in the skeletal interpreter
2721   // frame and the stack walking of interpreter_sender will get the unextended sp
2722   // value and not the "real" sp value.
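       // caller_adjustment comes from the UnrollBlock filled in by fetch_unroll_info: it is
       // the number of bytes by which the caller's stack must be extended so that the
       // reconstructed interpreter frames have room for their locals and expression stacks.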
2723 
2724   const Register sender_sp = r8;
2725 
2726   __ mov(sender_sp, rsp);
2727   __ movl(rbx, Address(rdi,
2728                        Deoptimization::UnrollBlock::
2729                        caller_adjustment_offset_in_bytes()));
2730   __ subptr(rsp, rbx);
2731 
2732   // Push interpreter frames in a loop
2733   Label loop;
2734   __ bind(loop);
2735   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2736   __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
2737   __ pushptr(Address(rcx, 0));          // Save return address
2738   __ enter();                           // Save old & set new rbp
2739   __ subptr(rsp, rbx);                  // Prolog
2740   // This value is corrected by layout_activation_impl
2741   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2742   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2743   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2744   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2745   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2746   __ decrementl(rdx);                   // Decrement counter
2747   __ jcc(Assembler::notZero, loop);
2748   __ pushptr(Address(rcx, 0));          // Save final return address
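       // frame_pcs[] supplies one pc per pushed frame plus the extra entry just pushed,
       // which is where the final ret(0) below ("Jump to interpreter") resumes execution.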
2749 
2750   // Re-push self-frame
2751   __ enter();                           // Save old & set new rbp
2752 
2753   // Allocate a full sized register save area.
2754   // Return address and rbp are in place, so we allocate two fewer words.
2755   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2756 
2757   // Restore frame locals after moving the frame
2758   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2759   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2760 
2761   // Call C code.  Need thread but NOT official VM entry
2762   // crud.  We cannot block on this call, no GC can happen.  Call should
2763   // restore return values to their stack-slots with the new SP.
2764   //
2765   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2766 
2767   // Use rbp because the frames look interpreted now
2768   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2769   // Don't need the precise return PC here, just precise enough to point into this code blob.
2770   address the_pc = __ pc();
2771   __ set_last_Java_frame(noreg, rbp, the_pc);
2772 
2773   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2774   __ mov(c_rarg0, r15_thread);
2775   __ movl(c_rarg1, r14); // second arg: exec_mode
2776   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2777   // Revert SP alignment after call since we're going to do some SP relative addressing below
2778   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2779 
2780   // Set an oopmap for the call site
2781   // Use the same PC we used for the last java frame
2782   oop_maps->add_gc_map(the_pc - start,
2783                        new OopMap( frame_size_in_words, 0 ));
2784 
2785   // Clear fp AND pc
2786   __ reset_last_Java_frame(true);
2787 
2788   // Collect return values
2789   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2790   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2791   // This reload of rdx is likely unnecessary (it would only hold a throwing pc).
2792   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2793 
2794   // Pop self-frame.
2795   __ leave();                           // Epilog
2796 
2797   // Jump to interpreter
2798   __ ret(0);
2799 
2800   // Make sure all code is generated
2801   masm->flush();
2802 
2803   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2804   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2805 #if INCLUDE_JVMCI
2806   if (EnableJVMCI) {
2807     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2808     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2809   }
2810 #endif
2811 }
2812 
2813 #ifdef COMPILER2
2814 //------------------------------generate_uncommon_trap_blob--------------------
2815 void SharedRuntime::generate_uncommon_trap_blob() {
2816   // Allocate space for the code
2817   ResourceMark rm;
2818   // Setup code generation tools
2819   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2820   MacroAssembler* masm = new MacroAssembler(&buffer);
2821 
2822   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2823 
2824   address start = __ pc();
2825 
2826   if (UseRTMLocking) {
2827     // Abort RTM transaction before possible nmethod deoptimization.
2828     __ xabort(0);
2829   }
2830 
2831   // Push self-frame.  We get here with a return address on the
2832   // stack, so rsp is 8-byte aligned until we allocate our frame.
2833   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
2834 
2835   // No callee saved registers. rbp is assumed implicitly saved
2836   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
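       // return_off << LogBytesPerInt bytes covers the arg_reg_save_area plus the two 32-bit
       // slots reserved for rbp; together with the return address already on the stack this
       // makes up a complete SimpleRuntimeFrame (framesize slots).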
2837 
2838   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2839   // runtime expects it.
2840   __ movl(c_rarg1, j_rarg0);
2841 
2842   __ set_last_Java_frame(noreg, noreg, NULL);
2843 
2844   // Call C code.  Need thread but NOT official VM entry
2845   // crud.  We cannot block on this call, no GC can happen.  Call should
2846   // capture callee-saved registers as well as return values.
2847   // Thread is in rdi already.
2848   //
2849   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
2850 
2851   __ mov(c_rarg0, r15_thread);
2852   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2853   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2854 
2855   // Set an oopmap for the call site
2856   OopMapSet* oop_maps = new OopMapSet();
2857   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2858 
2859   // location of rbp is known implicitly by the frame sender code
2860 
2861   oop_maps->add_gc_map(__ pc() - start, map);
2862 
2863   __ reset_last_Java_frame(false);
2864 
2865   // Load UnrollBlock* into rdi
2866   __ mov(rdi, rax);
2867 
2868 #ifdef ASSERT
2869   { Label L;
2870     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2871             (int32_t)Deoptimization::Unpack_uncommon_trap);
2872     __ jcc(Assembler::equal, L);
2873     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2874     __ bind(L);
2875   }
2876 #endif
2877 
2878   // Pop all the frames we must move/replace.
2879   //
2880   // Frame picture (youngest to oldest)
2881   // 1: self-frame (no frame link)
2882   // 2: deopting frame  (no frame link)
2883   // 3: caller of deopting frame (could be compiled/interpreted).
2884 
2885   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2886   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2887 
2888   // Pop deoptimized frame (int)
2889   __ movl(rcx, Address(rdi,
2890                        Deoptimization::UnrollBlock::
2891                        size_of_deoptimized_frame_offset_in_bytes()));
2892   __ addptr(rsp, rcx);
2893 
2894   // rsp should be pointing at the return address to the caller (3)
2895 
2896   // Pick up the initial fp we should save
2897   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2898   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2899 
2900 #ifdef ASSERT
2901   // Compilers generate code that bangs the stack by as much as the
2902   // interpreter would need. So this stack banging should never
2903   // trigger a fault. Verify that it does not on non-product builds.
2904   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2905   __ bang_stack_size(rbx, rcx);
2906 #endif
2907 
2908   // Load address of array of frame pcs into rcx (address*)
2909   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2910 
2911   // Trash the return pc
2912   __ addptr(rsp, wordSize);
2913 
2914   // Load address of array of frame sizes into rsi (intptr_t*)
2915   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2916 
2917   // Counter
2918   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
2919 
2920   // Now adjust the caller's stack to make up for the extra locals but
2921   // record the original sp so that we can save it in the skeletal
2922   // interpreter frame and the stack walking of interpreter_sender
2923   // will get the unextended sp value and not the "real" sp value.
2924 
2925   const Register sender_sp = r8;
2926 
2927   __ mov(sender_sp, rsp);
2928   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
2929   __ subptr(rsp, rbx);
2930 
2931   // Push interpreter frames in a loop
2932   Label loop;
2933   __ bind(loop);
2934   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2935   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2936   __ pushptr(Address(rcx, 0));     // Save return address
2937   __ enter();                      // Save old & set new rbp
2938   __ subptr(rsp, rbx);             // Prolog
2939   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2940             sender_sp);            // Make it walkable
2941   // This value is corrected by layout_activation_impl
2942   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2943   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
2944   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
2945   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
2946   __ decrementl(rdx);              // Decrement counter
2947   __ jcc(Assembler::notZero, loop);
2948   __ pushptr(Address(rcx, 0));     // Save final return address
2949 
2950   // Re-push self-frame
2951   __ enter();                 // Save old & set new rbp
2952   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2953                               // Prolog
2954 
2955   // Use rbp because the frames look interpreted now
2956   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2957   // Don't need the precise return PC here, just precise enough to point into this code blob.
2958   address the_pc = __ pc();
2959   __ set_last_Java_frame(noreg, rbp, the_pc);
2960 
2961   // Call C code.  Need thread but NOT official VM entry
2962   // crud.  We cannot block on this call, no GC can happen.  Call should
2963   // restore return values to their stack-slots with the new SP.
2964   // Thread is in rdi already.
2965   //
2966   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2967 
2968   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2969   __ mov(c_rarg0, r15_thread);
2970   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2971   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2972 
2973   // Set an oopmap for the call site
2974   // Use the same PC we used for the last java frame
2975   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2976 
2977   // Clear fp AND pc
2978   __ reset_last_Java_frame(true);
2979 
2980   // Pop self-frame.
2981   __ leave();                 // Epilog
2982 
2983   // Jump to interpreter
2984   __ ret(0);
2985 
2986   // Make sure all code is generated
2987   masm->flush();
2988 
2989   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
2990                                                  SimpleRuntimeFrame::framesize >> 1);
2991 }
2992 #endif // COMPILER2
2993 
2994 //------------------------------generate_handler_blob------
2995 //
2996 // Generate a special Compile2Runtime blob that saves all registers,
2997 // and setup oopmap.
2998 //
2999 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3000   assert(StubRoutines::forward_exception_entry() != NULL,
3001          "must be generated before");
3002 
3003   ResourceMark rm;
3004   OopMapSet *oop_maps = new OopMapSet();
3005   OopMap* map;
3006 
3007   // Allocate space for the code.  Setup code generation tools.
3008   CodeBuffer buffer("handler_blob", 2048, 1024);
3009   MacroAssembler* masm = new MacroAssembler(&buffer);
3010 
3011   address start   = __ pc();
3012   address call_pc = NULL;
3013   int frame_size_in_words;
3014   bool cause_return = (poll_type == POLL_AT_RETURN);
3015   bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3016 
3017   if (UseRTMLocking) {
3018     // Abort RTM transaction before calling runtime
3019     // because critical section will be large and will be
3020     // aborted anyway. Also nmethod could be deoptimized.
3021     __ xabort(0);
3022   }
3023 
3024   // Make room for return address (or push it again)
3025   if (!cause_return) {
3026     __ push(rbx);
3027   }
3028 
3029   // Save registers, fpu state, and flags
3030   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
3031 
3032   // The following is basically a call_VM.  However, we need the precise
3033   // address of the call in order to generate an oopmap. Hence, we do all the
3034   // work ourselves.
3035 
3036   __ set_last_Java_frame(noreg, noreg, NULL);
3037 
3038   // The return address must always be correct so that frame constructor never
3039   // sees an invalid pc.
3040 
3041   if (!cause_return) {
3042     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3043     // Additionally, rbx is a callee saved register and we can look at it later to determine
3044     // if someone changed the return address for us!
3045     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3046     __ movptr(Address(rbp, wordSize), rbx);
3047   }
3048 
3049   // Do the call
3050   __ mov(c_rarg0, r15_thread);
3051   __ call(RuntimeAddress(call_ptr));
3052 
3053   // Set an oopmap for the call site.  This oopmap will map all
3054   // oop-registers and debug-info registers as callee-saved.  This
3055   // will allow deoptimization at this safepoint to find all possible
3056   // debug-info recordings, as well as let GC find all oops.
3057 
3058   oop_maps->add_gc_map( __ pc() - start, map);
3059 
3060   Label noException;
3061 
3062   __ reset_last_Java_frame(false);
3063 
3064   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3065   __ jcc(Assembler::equal, noException);
3066 
3067   // Exception pending
3068 
3069   RegisterSaver::restore_live_registers(masm, save_vectors);
3070 
3071   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3072 
3073   // No exception case
3074   __ bind(noException);
3075 
3076   Label no_adjust;
3077 #ifdef ASSERT
3078   Label bail;
3079 #endif
3080   if (!cause_return) {
3081     Label no_prefix, not_special;
3082 
3083     // If our stashed return pc was modified by the runtime we avoid touching it
3084     __ cmpptr(rbx, Address(rbp, wordSize));
3085     __ jccb(Assembler::notEqual, no_adjust);
3086 
3087     // Skip over the poll instruction.
3088     // See NativeInstruction::is_safepoint_poll()
3089     // Possible encodings:
3090     //      85 00       test   %eax,(%rax)
3091     //      85 01       test   %eax,(%rcx)
3092     //      85 02       test   %eax,(%rdx)
3093     //      85 03       test   %eax,(%rbx)
3094     //      85 06       test   %eax,(%rsi)
3095     //      85 07       test   %eax,(%rdi)
3096     //
3097     //   41 85 00       test   %eax,(%r8)
3098     //   41 85 01       test   %eax,(%r9)
3099     //   41 85 02       test   %eax,(%r10)
3100     //   41 85 03       test   %eax,(%r11)
3101     //   41 85 06       test   %eax,(%r14)
3102     //   41 85 07       test   %eax,(%r15)
3103     //
3104     //      85 04 24    test   %eax,(%rsp)
3105     //   41 85 04 24    test   %eax,(%r12)
3106     //      85 45 00    test   %eax,0x0(%rbp)
3107     //   41 85 45 00    test   %eax,0x0(%r13)
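         // Based on the encodings above, the code below skips 1 byte for a REX.B prefix (0x41),
         // 1 more byte when the modrm base is rsp/rbp/r12/r13 (an extra SIB or disp8 byte), and
         // finally the 2 bytes of opcode + modrm; e.g. "41 85 45 00" is skipped as 1 + 1 + 2 = 4 bytes.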
3108 
3109     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3110     __ jcc(Assembler::notEqual, no_prefix);
3111     __ addptr(rbx, 1);
3112     __ bind(no_prefix);
3113 #ifdef ASSERT
3114     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3115 #endif
3116     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3117     // r12/rsp 0x04
3118     // r13/rbp 0x05
3119     __ movzbq(rcx, Address(rbx, 1));
3120     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3121     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3122     __ cmpptr(rcx, 1);
3123     __ jcc(Assembler::above, not_special);
3124     __ addptr(rbx, 1);
3125     __ bind(not_special);
3126 #ifdef ASSERT
3127     // Verify the correct encoding of the poll we're about to skip.
3128     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3129     __ jcc(Assembler::notEqual, bail);
3130     // Mask out the modrm bits
3131     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3132     // rax encodes to 0, so if the bits are nonzero it's incorrect
3133     __ jcc(Assembler::notZero, bail);
3134 #endif
3135     // Adjust return pc forward to step over the safepoint poll instruction
3136     __ addptr(rbx, 2);
3137     __ movptr(Address(rbp, wordSize), rbx);
3138   }
3139 
3140   __ bind(no_adjust);
3141   // Normal exit, restore registers and exit.
3142   RegisterSaver::restore_live_registers(masm, save_vectors);
3143   __ ret(0);
3144 
3145 #ifdef ASSERT
3146   __ bind(bail);
3147   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3148 #endif
3149 
3150   // Make sure all code is generated
3151   masm->flush();
3152 
3153   // Fill-out other meta info
3154   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3155 }
3156 
3157 //
3158 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3159 //
3160 // Generate a stub that calls into vm to find out the proper destination
3161 // of a java call. All the argument registers are live at this point
3162 // but since this is generic code we don't know what they are and the caller
3163 // must do any gc of the args.
3164 //
3165 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3166   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3167 
3168   // allocate space for the code
3169   ResourceMark rm;
3170 
3171   CodeBuffer buffer(name, 1000, 512);
3172   MacroAssembler* masm                = new MacroAssembler(&buffer);
3173 
3174   int frame_size_in_words;
3175 
3176   OopMapSet *oop_maps = new OopMapSet();
3177   OopMap* map = NULL;
3178 
3179   int start = __ offset();
3180 
3181   // No need to save vector registers since they are caller-saved anyway.
3182   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
3183 
3184   int frame_complete = __ offset();
3185 
3186   __ set_last_Java_frame(noreg, noreg, NULL);
3187 
3188   __ mov(c_rarg0, r15_thread);
3189 
3190   __ call(RuntimeAddress(destination));
3191 
3192 
3193   // Set an oopmap for the call site.
3194   // We need this not only for callee-saved registers, but also for volatile
3195   // registers that the compiler might be keeping live across a safepoint.
3196 
3197   oop_maps->add_gc_map( __ offset() - start, map);
3198 
3199   // rax contains the address we are going to jump to, assuming no exception got installed
3200 
3201   // clear last_Java_sp
3202   __ reset_last_Java_frame(false);
3203   // check for pending exceptions
3204   Label pending;
3205   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3206   __ jcc(Assembler::notEqual, pending);
3207 
3208   // get the returned Method*
3209   __ get_vm_result_2(rbx, r15_thread);
3210   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3211 
3212   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3213 
3214   RegisterSaver::restore_live_registers(masm);
3215 
3216   // We are back to the original state on entry and ready to go.
3217 
3218   __ jmp(rax);
3219 
3220   // Pending exception after the safepoint
3221 
3222   __ bind(pending);
3223 
3224   RegisterSaver::restore_live_registers(masm);
3225 
3226   // exception pending => remove activation and forward to exception handler
3227 
3228   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3229 
3230   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3231   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3232 
3233   // -------------
3234   // make sure all code is generated
3235   masm->flush();
3236 
3237   // return the blob
3238   // frame_size_in_words is in words, which is what new_runtime_stub expects
3239   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3240 }
3241 
3242 #ifdef COMPILER2
3243 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3244 
3245 class NativeInvokerGenerator : public StubCodeGenerator {
3246   address _call_target;
3247   int _shadow_space_bytes;
3248 
3249   const GrowableArray<VMReg>& _input_registers;
3250   const GrowableArray<VMReg>& _output_registers;
3251 
3252   int _frame_complete;
3253   int _framesize;
3254   OopMapSet* _oop_maps;
3255 public:
3256   NativeInvokerGenerator(CodeBuffer* buffer,
3257                          address call_target,
3258                          int shadow_space_bytes,
3259                          const GrowableArray<VMReg>& input_registers,
3260                          const GrowableArray<VMReg>& output_registers)
3261    : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3262      _call_target(call_target),
3263      _shadow_space_bytes(shadow_space_bytes),
3264      _input_registers(input_registers),
3265      _output_registers(output_registers),
3266      _frame_complete(0),
3267      _framesize(0),
3268      _oop_maps(NULL) {
3269     assert(_output_registers.length() <= 1
3270            || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3271 
3272   }
3273 
3274   void generate();
3275 
3276   int spill_size_in_bytes() const {
3277     if (_output_registers.length() == 0) {
3278       return 0;
3279     }
3280     VMReg reg = _output_registers.at(0);
3281     assert(reg->is_reg(), "must be a register");
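         // Reserve spill space for the widest form the register can take: 8 bytes for a general
         // purpose register, or 16/32/64 bytes for an XMM register depending on the AVX level in
         // use (matching the movdqu/vmovdqu/evmovdqul spills below).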
3282     if (reg->is_Register()) {
3283       return 8;
3284     } else if (reg->is_XMMRegister()) {
3285       if (UseAVX >= 3) {
3286         return 64;
3287       } else if (UseAVX >= 1) {
3288         return 32;
3289       } else {
3290         return 16;
3291       }
3292     } else {
3293       ShouldNotReachHere();
3294     }
3295     return 0;
3296   }
3297 
3298   void spill_out_registers() {
3299     if (_output_registers.length() == 0) {
3300       return;
3301     }
3302     VMReg reg = _output_registers.at(0);
3303     assert(reg->is_reg(), "must be a register");
3304     MacroAssembler* masm = _masm;
3305     if (reg->is_Register()) {
3306       __ movptr(Address(rsp, 0), reg->as_Register());
3307     } else if (reg->is_XMMRegister()) {
3308       if (UseAVX >= 3) {
3309         __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3310       } else if (UseAVX >= 1) {
3311         __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3312       } else {
3313         __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3314       }
3315     } else {
3316       ShouldNotReachHere();
3317     }
3318   }
3319 
3320   void fill_out_registers() {
3321     if (_output_registers.length() == 0) {
3322       return;
3323     }
3324     VMReg reg = _output_registers.at(0);
3325     assert(reg->is_reg(), "must be a register");
3326     MacroAssembler* masm = _masm;
3327     if (reg->is_Register()) {
3328       __ movptr(reg->as_Register(), Address(rsp, 0));
3329     } else if (reg->is_XMMRegister()) {
3330       if (UseAVX >= 3) {
3331         __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3332       } else if (UseAVX >= 1) {
3333         __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3334       } else {
3335         __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3336       }
3337     } else {
3338       ShouldNotReachHere();
3339     }
3340   }
3341 
3342   int frame_complete() const {
3343     return _frame_complete;
3344   }
3345 
3346   int framesize() const {
3347     return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3348   }
3349 
3350   OopMapSet* oop_maps() const {
3351     return _oop_maps;
3352   }
3353 
3354 private:
3355 #ifdef ASSERT
3356 bool target_uses_register(VMReg reg) {
3357   return _input_registers.contains(reg) || _output_registers.contains(reg);
3358 }
3359 #endif
3360 };
3361 
3362 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3363                                                 int shadow_space_bytes,
3364                                                 const GrowableArray<VMReg>& input_registers,
3365                                                 const GrowableArray<VMReg>& output_registers) {
3366   int locs_size  = 64;
3367   CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3368   NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3369   g.generate();
3370   code.log_section_sizes("nep_invoker_blob");
3371 
3372   RuntimeStub* stub =
3373     RuntimeStub::new_runtime_stub("nep_invoker_blob",
3374                                   &code,
3375                                   g.frame_complete(),
3376                                   g.framesize(),
3377                                   g.oop_maps(), false);
3378   return stub;
3379 }
3380 
3381 void NativeInvokerGenerator::generate() {
3382   assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3383 
3384   enum layout {
3385     rbp_off,
3386     rbp_off2,
3387     return_off,
3388     return_off2,
3389     framesize // inclusive of return address
3390   };
3391 
3392   _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3393   assert(is_even(_framesize/2), "sp not 16-byte aligned");
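       // _framesize is counted in 4-byte stack slots; rounding up to a multiple of 4 slots keeps
       // rsp 16-byte aligned, and framesize() above converts the slot count to words for the
       // RuntimeStub.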
3394 
3395   _oop_maps  = new OopMapSet();
3396   MacroAssembler* masm = _masm;
3397 
3398   address start = __ pc();
3399 
3400   __ enter();
3401 
3402   // return address and rbp are already in place
3403   __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3404 
3405   _frame_complete = __ pc() - start;
3406 
3407   address the_pc = __ pc();
3408 
3409   __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3410   OopMap* map = new OopMap(_framesize, 0);
3411   _oop_maps->add_gc_map(the_pc - start, map);
3412 
3413   // State transition
3414   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3415 
3416   __ call(RuntimeAddress(_call_target));
3417 
3418   __ restore_cpu_control_state_after_jni();
3419 
3420   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3421 
3422   // Force this write out before the read below
3423   __ membar(Assembler::Membar_mask_bits(
3424           Assembler::LoadLoad | Assembler::LoadStore |
3425           Assembler::StoreLoad | Assembler::StoreStore));
3426 
3427   Label L_after_safepoint_poll;
3428   Label L_safepoint_poll_slow_path;
3429 
3430   __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3431   __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3432   __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
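       // Either a safepoint/handshake poll is armed or a suspend flag is set: take the slow path,
       // which spills the return value, calls JavaThread::check_special_condition_for_native_trans,
       // restores the value, and then continues at L_after_safepoint_poll
       // (see L_safepoint_poll_slow_path below).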
3433 
3434   __ bind(L_after_safepoint_poll);
3435 
3436   // change thread state
3437   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3438 
3439   __ block_comment("reguard stack check");
3440   Label L_reguard;
3441   Label L_after_reguard;
3442   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3443   __ jcc(Assembler::equal, L_reguard);
3444   __ bind(L_after_reguard);
3445 
3446   __ reset_last_Java_frame(r15_thread, true);
3447 
3448   __ leave(); // required for proper stackwalking of RuntimeStub frame
3449   __ ret(0);
3450 
3451   //////////////////////////////////////////////////////////////////////////////
3452 
3453   __ block_comment("{ L_safepoint_poll_slow_path");
3454   __ bind(L_safepoint_poll_slow_path);
3455   __ vzeroupper();
3456 
3457   spill_out_registers();
3458 
3459   __ mov(c_rarg0, r15_thread);
3460   __ mov(r12, rsp); // remember sp
3461   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3462   __ andptr(rsp, -16); // align stack as required by ABI
3463   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3464   __ mov(rsp, r12); // restore sp
3465   __ reinit_heapbase();
3466 
3467   fill_out_registers();
3468 
3469   __ jmp(L_after_safepoint_poll);
3470   __ block_comment("} L_safepoint_poll_slow_path");
3471 
3472   //////////////////////////////////////////////////////////////////////////////
3473 
3474   __ block_comment("{ L_reguard");
3475   __ bind(L_reguard);
3476   __ vzeroupper();
3477 
3478   spill_out_registers();
3479 
3480   __ mov(r12, rsp); // remember sp
3481   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3482   __ andptr(rsp, -16); // align stack as required by ABI
3483   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3484   __ mov(rsp, r12); // restore sp
3485   __ reinit_heapbase();
3486 
3487   fill_out_registers();
3488 
3489   __ jmp(L_after_reguard);
3490 
3491   __ block_comment("} L_reguard");
3492 
3493   //////////////////////////////////////////////////////////////////////////////
3494 
3495   __ flush();
3496 }
3497 #endif // COMPILER2
3498 
3499 //------------------------------Montgomery multiplication------------------------
3500 //
3501 
3502 #ifndef _WINDOWS
3503 
3504 // Subtract 0:b from carry:a.  Return carry.
3505 static julong
3506 sub(julong a[], julong b[], julong carry, long len) {
3507   long long i = 0, cnt = len;
3508   julong tmp;
3509   asm volatile("clc; "
3510                "0: ; "
3511                "mov (%[b], %[i], 8), %[tmp]; "
3512                "sbb %[tmp], (%[a], %[i], 8); "
3513                "inc %[i]; dec %[cnt]; "
3514                "jne 0b; "
3515                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3516                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3517                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3518                : "memory");
3519   return tmp;
3520 }
3521 
3522 // Multiply (unsigned) Long A by Long B, accumulating the double-
3523 // length result into the accumulator formed of T0, T1, and T2.
3524 #define MACC(A, B, T0, T1, T2)                                  \
3525 do {                                                            \
3526   unsigned long hi, lo;                                         \
3527   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3528            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3529            : "r"(A), "a"(B) : "cc");                            \
3530  } while(0)
3531 
3532 // As above, but add twice the double-length result into the
3533 // accumulator.
3534 #define MACC2(A, B, T0, T1, T2)                                 \
3535 do {                                                            \
3536   unsigned long hi, lo;                                         \
3537   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3538            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3539            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3540            : "r"(A), "a"(B) : "cc");                            \
3541  } while(0)
3542 
3543 #else //_WINDOWS
3544 
3545 static julong
3546 sub(julong a[], julong b[], julong carry, long len) {
3547   long i;
3548   julong tmp;
3549   unsigned char c = 1;
3550   for (i = 0; i < len; i++) {
3551     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3552     a[i] = tmp;
3553   }
3554   c = _addcarry_u64(c, carry, ~0, &tmp);
3555   return tmp;
3556 }
3557 
3558 // Multiply (unsigned) Long A by Long B, accumulating the double-
3559 // length result into the accumulator formed of T0, T1, and T2.
3560 #define MACC(A, B, T0, T1, T2)                          \
3561 do {                                                    \
3562   julong hi, lo;                            \
3563   lo = _umul128(A, B, &hi);                             \
3564   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3565   c = _addcarry_u64(c, hi, T1, &T1);                    \
3566   _addcarry_u64(c, T2, 0, &T2);                         \
3567  } while(0)
3568 
3569 // As above, but add twice the double-length result into the
3570 // accumulator.
3571 #define MACC2(A, B, T0, T1, T2)                         \
3572 do {                                                    \
3573   julong hi, lo;                            \
3574   lo = _umul128(A, B, &hi);                             \
3575   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3576   c = _addcarry_u64(c, hi, T1, &T1);                    \
3577   _addcarry_u64(c, T2, 0, &T2);                         \
3578   c = _addcarry_u64(0, lo, T0, &T0);                    \
3579   c = _addcarry_u64(c, hi, T1, &T1);                    \
3580   _addcarry_u64(c, T2, 0, &T2);                         \
3581  } while(0)
3582 
3583 #endif //_WINDOWS
3584 
3585 // Fast Montgomery multiplication.  The derivation of the algorithm is
3586 // in A Cryptographic Library for the Motorola DSP56000,
3587 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
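     // A sketch of the reduction, assuming R = 2^(64*len) and inv == -n^-1 (mod 2^64): each
     // iteration picks m[i] = t0 * inv (mod 2^64), so t0 + m[i]*n[0] == 0 (mod 2^64) and the
     // low 64 bits of the accumulator cancel (the assert below checks exactly this). What
     // accumulates in m[] is congruent to a*b*R^-1 (mod n); the trailing sub() loop removes
     // any leftover carry.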
3588 
3589 static void NOINLINE
3590 montgomery_multiply(julong a[], julong b[], julong n[],
3591                     julong m[], julong inv, int len) {
3592   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3593   int i;
3594 
3595   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3596 
3597   for (i = 0; i < len; i++) {
3598     int j;
3599     for (j = 0; j < i; j++) {
3600       MACC(a[j], b[i-j], t0, t1, t2);
3601       MACC(m[j], n[i-j], t0, t1, t2);
3602     }
3603     MACC(a[i], b[0], t0, t1, t2);
3604     m[i] = t0 * inv;
3605     MACC(m[i], n[0], t0, t1, t2);
3606 
3607     assert(t0 == 0, "broken Montgomery multiply");
3608 
3609     t0 = t1; t1 = t2; t2 = 0;
3610   }
3611 
3612   for (i = len; i < 2*len; i++) {
3613     int j;
3614     for (j = i-len+1; j < len; j++) {
3615       MACC(a[j], b[i-j], t0, t1, t2);
3616       MACC(m[j], n[i-j], t0, t1, t2);
3617     }
3618     m[i-len] = t0;
3619     t0 = t1; t1 = t2; t2 = 0;
3620   }
3621 
3622   while (t0)
3623     t0 = sub(m, n, t0, len);
3624 }
3625 
3626 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3627 // multiplies so it should be up to 25% faster than Montgomery
3628 // multiplication.  However, its loop control is more complex and it
3629 // may actually run slower on some machines.
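     // The savings come from symmetry: for j != i-j the cross product a[j]*a[i-j] occurs
     // twice, so it is accumulated once and doubled via MACC2, while the diagonal term
     // a[j]*a[j] (present only when i is even) is added once via MACC.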
3630 
3631 static void NOINLINE
3632 montgomery_square(julong a[], julong n[],
3633                   julong m[], julong inv, int len) {
3634   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3635   int i;
3636 
3637   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3638 
3639   for (i = 0; i < len; i++) {
3640     int j;
3641     int end = (i+1)/2;
3642     for (j = 0; j < end; j++) {
3643       MACC2(a[j], a[i-j], t0, t1, t2);
3644       MACC(m[j], n[i-j], t0, t1, t2);
3645     }
3646     if ((i & 1) == 0) {
3647       MACC(a[j], a[j], t0, t1, t2);
3648     }
3649     for (; j < i; j++) {
3650       MACC(m[j], n[i-j], t0, t1, t2);
3651     }
3652     m[i] = t0 * inv;
3653     MACC(m[i], n[0], t0, t1, t2);
3654 
3655     assert(t0 == 0, "broken Montgomery square");
3656 
3657     t0 = t1; t1 = t2; t2 = 0;
3658   }
3659 
3660   for (i = len; i < 2*len; i++) {
3661     int start = i-len+1;
3662     int end = start + (len - start)/2;
3663     int j;
3664     for (j = start; j < end; j++) {
3665       MACC2(a[j], a[i-j], t0, t1, t2);
3666       MACC(m[j], n[i-j], t0, t1, t2);
3667     }
3668     if ((i & 1) == 0) {
3669       MACC(a[j], a[j], t0, t1, t2);
3670     }
3671     for (; j < len; j++) {
3672       MACC(m[j], n[i-j], t0, t1, t2);
3673     }
3674     m[i-len] = t0;
3675     t0 = t1; t1 = t2; t2 = 0;
3676   }
3677 
3678   while (t0)
3679     t0 = sub(m, n, t0, len);
3680 }
3681 
3682 // Swap words in a longword.
3683 static julong swap(julong x) {
3684   return (x << 32) | (x >> 32);
3685 }
3686 
3687 // Copy len longwords from s to d, word-swapping as we go.  The
3688 // destination array is reversed.
3689 static void reverse_words(julong *s, julong *d, int len) {
3690   d += len;
3691   while(len-- > 0) {
3692     d--;
3693     *d = swap(*s);
3694     s++;
3695   }
3696 }
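     // reverse_words reverses the order of the 64-bit limbs and swaps the 32-bit halves within
     // each limb, converting between the jint layout supplied by the caller and the
     // least-significant-limb-first julong layout that montgomery_multiply/montgomery_square
     // index from element 0; the same routine converts the result back.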
3697 
3698 // The threshold at which squaring is advantageous was determined
3699 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3700 #define MONTGOMERY_SQUARING_THRESHOLD 64
3701 
3702 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3703                                         jint len, jlong inv,
3704                                         jint *m_ints) {
3705   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3706   int longwords = len/2;
3707 
3708   // Make very sure we don't use so much space that the stack might
3709   // overflow.  512 jints corresponds to a 16384-bit integer and
3710   // will use here a total of 8k bytes of stack space.
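       // For example, len == 512 jints gives longwords == 256, so the four scratch arrays use
       // 256 * 8 * 4 == 8192 bytes.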
3711   int total_allocation = longwords * sizeof (julong) * 4;
3712   guarantee(total_allocation <= 8192, "must be");
3713   julong *scratch = (julong *)alloca(total_allocation);
3714 
3715   // Local scratch arrays
3716   julong
3717     *a = scratch + 0 * longwords,
3718     *b = scratch + 1 * longwords,
3719     *n = scratch + 2 * longwords,
3720     *m = scratch + 3 * longwords;
3721 
3722   reverse_words((julong *)a_ints, a, longwords);
3723   reverse_words((julong *)b_ints, b, longwords);
3724   reverse_words((julong *)n_ints, n, longwords);
3725 
3726   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3727 
3728   reverse_words(m, (julong *)m_ints, longwords);
3729 }
3730 
3731 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3732                                       jint len, jlong inv,
3733                                       jint *m_ints) {
3734   assert(len % 2 == 0, "array length in montgomery_square must be even");
3735   int longwords = len/2;
3736 
3737   // Make very sure we don't use so much space that the stack might
3738   // overflow.  512 jints corresponds to a 16384-bit integer and
3739   // will use here a total of 6k bytes of stack space.
3740   int total_allocation = longwords * sizeof (julong) * 3;
3741   guarantee(total_allocation <= 8192, "must be");
3742   julong *scratch = (julong *)alloca(total_allocation);
3743 
3744   // Local scratch arrays
3745   julong
3746     *a = scratch + 0 * longwords,
3747     *n = scratch + 1 * longwords,
3748     *m = scratch + 2 * longwords;
3749 
3750   reverse_words((julong *)a_ints, a, longwords);
3751   reverse_words((julong *)n_ints, n, longwords);
3752 
3753   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3754     ::montgomery_square(a, n, m, (julong)inv, longwords);
3755   } else {
3756     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3757   }
3758 
3759   reverse_words(m, (julong *)m_ints, longwords);
3760 }
3761 
3762 #ifdef COMPILER2
3763 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3764 //
3765 //------------------------------generate_exception_blob---------------------------
3766 // creates exception blob at the end
3767 // Using exception blob, this code is jumped to from a compiled method.
3768 // (see emit_exception_handler in x86_64.ad file)
3769 //
3770 // Given an exception pc at a call we call into the runtime for the
3771 // handler in this method. This handler might merely restore state
3772 // (i.e. callee save registers), unwind the frame, and jump to the
3773 // exception handler for the nmethod if there is no Java level handler
3774 // for the nmethod.
3775 //
3776 // This code is entered with a jmp.
3777 //
3778 // Arguments:
3779 //   rax: exception oop
3780 //   rdx: exception pc
3781 //
3782 // Results:
3783 //   rax: exception oop
3784 //   rdx: exception pc in caller or ???
3785 //   destination: exception handler of caller
3786 //
3787 // Note: the exception pc MUST be at a call (precise debug information)
3788 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3789 //
3790 
3791 void OptoRuntime::generate_exception_blob() {
3792   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3793   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3794   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3795 
3796   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3797 
3798   // Allocate space for the code
3799   ResourceMark rm;
3800   // Setup code generation tools
3801   CodeBuffer buffer("exception_blob", 2048, 1024);
3802   MacroAssembler* masm = new MacroAssembler(&buffer);
3803 
3804 
3805   address start = __ pc();
3806 
3807   // Exception pc is 'return address' for stack walker
3808   __ push(rdx);
3809   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3810 
3811   // Save callee-saved registers.  See x86_64.ad.
3812 
3813   // rbp is an implicitly saved callee saved register (i.e., the calling
3814   // convention will save/restore it in the prolog/epilog). Other than that
3815   // there are no callee save registers now that adapter frames are gone.
3816 
3817   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3818 
3819   // Store exception in Thread object. We cannot pass any arguments to the
3820   // handle_exception call, since we do not want to make any assumption
3821   // about the size of the frame where the exception happened in.
3822   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3823   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3824   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3825 
3826   // This call does all the hard work.  It checks if an exception handler
3827   // exists in the method.
3828   // If so, it returns the handler address.
3829   // If not, it prepares for stack-unwinding, restoring the callee-save
3830   // registers of the frame being removed.
3831   //
3832   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3833 
3834   // At a method handle call, the stack may not be properly aligned
3835   // when returning with an exception.
3836   address the_pc = __ pc();
3837   __ set_last_Java_frame(noreg, noreg, the_pc);
3838   __ mov(c_rarg0, r15_thread);
3839   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3840   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3841 
3842   // Set an oopmap for the call site.  This oopmap will only be used if we
3843   // are unwinding the stack.  Hence, all locations will be dead.
3844   // Callee-saved registers will be the same as the frame above (i.e.,
3845   // handle_exception_stub), since they were restored when we got the
3846   // exception.
3847 
3848   OopMapSet* oop_maps = new OopMapSet();
3849 
3850   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3851 
3852   __ reset_last_Java_frame(false);
3853 
3854   // Restore callee-saved registers
3855 
3856   // rbp is an implicitly saved callee-saved register (i.e., the calling
3857   // convention will save/restore it in the prolog/epilog). Other than that
3858   // there are no callee save registers now that adapter frames are gone.
3859 
3860   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3861 
3862   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3863   __ pop(rdx);                  // No need for exception pc anymore
3864 
3865   // rax: exception handler
3866 
3867   // We have a handler in rax (could be deopt blob).
3868   __ mov(r8, rax);
3869 
3870   // Get the exception oop
3871   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3872   // Get the exception pc in case we are deoptimized
3873   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3874 #ifdef ASSERT
3875   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3876   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3877 #endif
3878   // Clear the exception oop so GC no longer processes it as a root.
3879   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3880 
3881   // rax: exception oop
3882   // r8:  exception handler
3883   // rdx: exception pc
3884   // Jump to handler
3885 
3886   __ jmp(r8);
3887 
3888   // Make sure all code is generated
3889   masm->flush();
3890 
3891   // Set exception blob
3892   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3893 }
3894 #endif // COMPILER2
3895 
3896 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
3897                                        int total_in_args, const VMRegPair* in_regs,
3898                                        int total_out_args, VMRegPair* out_regs,
3899                                        GrowableArray<int>& arg_order,
3900                                        VMRegPair tmp_vmreg) {
3901   ComputeMoveOrder order(total_in_args, in_regs,
3902                          total_out_args, out_regs,
3903                          in_sig_bt, arg_order, tmp_vmreg);
3904 }