1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/debugInfoRec.hpp"
  32 #include "code/icBuffer.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/compiledICHolder.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/jniHandles.hpp"
  48 #include "runtime/safepointMechanism.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/signature.hpp"
  51 #include "runtime/stubRoutines.hpp"
  52 #include "runtime/vframeArray.hpp"
  53 #include "runtime/vm_version.hpp"
  54 #include "utilities/align.hpp"
  55 #include "utilities/formatBuffer.hpp"
  56 #include "vmreg_x86.inline.hpp"
  57 #ifdef COMPILER1
  58 #include "c1/c1_Runtime1.hpp"
  59 #endif
  60 #ifdef COMPILER2
  61 #include "opto/runtime.hpp"
  62 #endif
  63 #if INCLUDE_JVMCI
  64 #include "jvmci/jvmciJavaClasses.hpp"
  65 #endif
  66 
  67 #define __ masm->
  68 
  69 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  70 
  71 class SimpleRuntimeFrame {
  72 
  73   public:
  74 
  75   // Most of the runtime stubs have this simple frame layout.
  76   // This class exists to make the layout shared in one place.
  77   // Offsets are for compiler stack slots, which are jints.
  78   enum layout {
  79     // The frame sender code expects that rbp will be in the "natural" place and
  80     // will override any oopMap setting for it. We must therefore force the layout
  81     // so that it agrees with the frame sender code.
  82     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  83     rbp_off2,
  84     return_off, return_off2,
  85     framesize
  86   };
  87 };
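     // For illustration only (assuming frame::arg_reg_save_area_bytes == 0, as on Linux;
     // on Windows the 32-byte argument register save area shifts everything up by 8 slots):
     //   rbp_off = 0, rbp_off2 = 1       -- saved rbp occupies slots 0..1 (one 64-bit word)
     //   return_off = 2, return_off2 = 3 -- the return address occupies slots 2..3
     //   framesize = 4 compiler slots = 16 bytes, so the frame stays 16-byte aligned.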
  88 
  89 class RegisterSaver {
  90   // Capture info about frame layout.  Layout offsets are in jint
  91   // units because compiler frame slots are jints.
  92 #define XSAVE_AREA_BEGIN 160
  93 #define XSAVE_AREA_YMM_BEGIN 576
  94 #define XSAVE_AREA_OPMASK_BEGIN 1088
  95 #define XSAVE_AREA_ZMM_BEGIN 1152
  96 #define XSAVE_AREA_UPPERBANK 1664
  97 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  98 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  99 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 100 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 101 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
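       // A worked expansion of the macros above (illustrative only), with BytesPerInt == 4:
       // DEF_XMM_OFFS(1) expands to
       //   xmm1_off = xmm_off + 1*16/BytesPerInt,   // 4 jint slots past xmm_off
       //   xmm1H_off                                // the next enumerator value
       // so each 16-byte XMM save advances the layout by 4 slots, each YMM upper half by
       // 4 slots, each ZMM upper half by 8 slots, and each full upper-bank ZMM by 16 slots.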
 102   enum layout {
 103     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 104     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 105     DEF_XMM_OFFS(0),
 106     DEF_XMM_OFFS(1),
 107     // 2..15 are implied in range usage
 108     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 109     DEF_YMM_OFFS(0),
 110     DEF_YMM_OFFS(1),
 111     // 2..15 are implied in range usage
 112     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 113     DEF_OPMASK_OFFS(0),
 114     DEF_OPMASK_OFFS(1),
 115     // 2..7 are implied in range usage
 116     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 117     DEF_ZMM_OFFS(0),
 118     DEF_ZMM_OFFS(1),
 119     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 120     DEF_ZMM_UPPER_OFFS(16),
 121     DEF_ZMM_UPPER_OFFS(17),
 122     // 18..31 are implied in range usage
 123     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 124     fpu_stateH_end,
 125     r15_off, r15H_off,
 126     r14_off, r14H_off,
 127     r13_off, r13H_off,
 128     r12_off, r12H_off,
 129     r11_off, r11H_off,
 130     r10_off, r10H_off,
 131     r9_off,  r9H_off,
 132     r8_off,  r8H_off,
 133     rdi_off, rdiH_off,
 134     rsi_off, rsiH_off,
 135     ignore_off, ignoreH_off,  // extra copy of rbp
 136     rsp_off, rspH_off,
 137     rbx_off, rbxH_off,
 138     rdx_off, rdxH_off,
 139     rcx_off, rcxH_off,
 140     rax_off, raxH_off,
 141     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 142     align_off, alignH_off,
 143     flags_off, flagsH_off,
 144     // The frame sender code expects that rbp will be in the "natural" place and
 145     // will override any oopMap setting for it. We must therefore force the layout
 146     // so that it agrees with the frame sender code.
 147     rbp_off, rbpH_off,        // copy of rbp we will restore
 148     return_off, returnH_off,  // slot for return address
 149     reg_save_size             // size in compiler stack slots
 150   };
 151 
 152  public:
 153   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 154   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 155 
 156   // Offsets into the register save area
 157   // Used by deoptimization when it is managing result register
 158   // values on its own
 159 
 160   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 161   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 162   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 163   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 164   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 165 
 166   // During deoptimization only the result registers need to be restored,
 167   // all the other values have already been extracted.
 168   static void restore_result_registers(MacroAssembler* masm);
 169 };
 170 
 171 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 172   int off = 0;
 173   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 174   if (UseAVX < 3) {
 175     num_xmm_regs = num_xmm_regs/2;
 176   }
 177 #if COMPILER2_OR_JVMCI
 178   if (save_wide_vectors && UseAVX == 0) {
 179     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 180   }
 181   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 182 #else
 183   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 184 #endif
 185 
 186   // Always make the frame size 16-byte aligned; both the vector and non-vector save frames are allocated this way.
 187   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 188   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 189   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 190   // CodeBlob frame size is in words.
 191   int frame_size_in_words = frame_size_in_bytes / wordSize;
 192   *total_frame_words = frame_size_in_words;
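       // Spelling out the units used above: compiler (OopMap) slots are jints and words
       // are 8 bytes on x86_64, so
       //   frame_size_in_slots == frame_size_in_bytes / 4
       //   frame_size_in_words == frame_size_in_bytes / 8
       // i.e. there are twice as many OopMap slots as CodeBlob words for this frame.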
 193 
 194   // Save registers, fpu state, and flags.
 195   // We assume caller has already pushed the return address onto the
 196   // stack, so rsp is 8-byte aligned here.
 197   // We push rbp twice in this sequence because we want the real rbp
 198   // to be under the return address like a normal enter.
 199 
 200   __ enter();          // rsp becomes 16-byte aligned here
 201   __ push_CPU_state(); // Push a multiple of 16 bytes
 202 
 203   // push_CPU_state handles this on EVEX enabled targets
 204   if (save_wide_vectors) {
 205     // Save upper half of YMM registers(0..15)
 206     int base_addr = XSAVE_AREA_YMM_BEGIN;
 207     for (int n = 0; n < 16; n++) {
 208       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 209     }
 210     if (VM_Version::supports_evex()) {
 211       // Save upper half of ZMM registers(0..15)
 212       base_addr = XSAVE_AREA_ZMM_BEGIN;
 213       for (int n = 0; n < 16; n++) {
 214         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 215       }
 216       // Save full ZMM registers(16..num_xmm_regs)
 217       base_addr = XSAVE_AREA_UPPERBANK;
 218       off = 0;
 219       int vector_len = Assembler::AVX_512bit;
 220       for (int n = 16; n < num_xmm_regs; n++) {
 221         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 222       }
 223 #if COMPILER2_OR_JVMCI
 224       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 225       off = 0;
 226       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 227         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 228       }
 229 #endif
 230     }
 231   } else {
 232     if (VM_Version::supports_evex()) {
 233       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 234       int base_addr = XSAVE_AREA_UPPERBANK;
 235       off = 0;
 236       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 237       for (int n = 16; n < num_xmm_regs; n++) {
 238         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 239       }
 240 #if COMPILER2_OR_JVMCI
 241       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 242       off = 0;
 243       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 244         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 245       }
 246 #endif
 247     }
 248   }
 249   __ vzeroupper();
 250   if (frame::arg_reg_save_area_bytes != 0) {
 251     // Allocate argument register save area
 252     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 253   }
 254 
 255   // Set an oopmap for the call site.  This oopmap will map all
 256   // oop-registers and debug-info registers as callee-saved.  This
 257   // will allow deoptimization at this safepoint to find all possible
 258   // debug-info recordings, as well as let GC find all oops.
 259 
 260   OopMapSet *oop_maps = new OopMapSet();
 261   OopMap* map = new OopMap(frame_size_in_slots, 0);
 262 
 263 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 264 
 265   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 266   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 267   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 269   // rbp location is known implicitly by the frame sender code, needs no oopmap
 270   // and the location where rbp was saved is ignored
 271   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 272   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 273   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 281   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 282   // on EVEX enabled targets it is included in the xsave area
 283   off = xmm0_off;
 284   int delta = xmm1_off - off;
 285   for (int n = 0; n < 16; n++) {
 286     XMMRegister xmm_name = as_XMMRegister(n);
 287     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 288     off += delta;
 289   }
 290   if (UseAVX > 2) {
 291     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 292     off = zmm16_off;
 293     delta = zmm17_off - off;
 294     for (int n = 16; n < num_xmm_regs; n++) {
 295       XMMRegister zmm_name = as_XMMRegister(n);
 296       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 297       off += delta;
 298     }
 299   }
 300 
 301 #if COMPILER2_OR_JVMCI
 302   if (save_wide_vectors) {
 303     // Save upper half of YMM registers(0..15)
 304     off = ymm0_off;
 305     delta = ymm1_off - ymm0_off;
 306     for (int n = 0; n < 16; n++) {
 307       XMMRegister ymm_name = as_XMMRegister(n);
 308       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 309       off += delta;
 310     }
 311     if (VM_Version::supports_evex()) {
 312       // Save upper half of ZMM registers(0..15)
 313       off = zmm0_off;
 314       delta = zmm1_off - zmm0_off;
 315       for (int n = 0; n < 16; n++) {
 316         XMMRegister zmm_name = as_XMMRegister(n);
 317         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 318         off += delta;
 319       }
 320     }
 321   }
 322 #endif // COMPILER2_OR_JVMCI
 323 
 324   // %%% These should all be a waste but we'll keep things as they were for now
 325   if (true) {
 326     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 327     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 328     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 330     // rbp location is known implicitly by the frame sender code, needs no oopmap
 331     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 333     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 341     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 342     // on EVEX enabled targets it is included in the xsave area
 343     off = xmm0H_off;
 344     delta = xmm1H_off - off;
 345     for (int n = 0; n < 16; n++) {
 346       XMMRegister xmm_name = as_XMMRegister(n);
 347       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 348       off += delta;
 349     }
 350     if (UseAVX > 2) {
 351       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 352       off = zmm16H_off;
 353       delta = zmm17H_off - off;
 354       for (int n = 16; n < num_xmm_regs; n++) {
 355         XMMRegister zmm_name = as_XMMRegister(n);
 356         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 357         off += delta;
 358       }
 359     }
 360   }
 361 
 362   return map;
 363 }
 364 
 365 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 366   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 367   if (UseAVX < 3) {
 368     num_xmm_regs = num_xmm_regs/2;
 369   }
 370   if (frame::arg_reg_save_area_bytes != 0) {
 371     // Pop arg register save area
 372     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 373   }
 374 
 375 #if COMPILER2_OR_JVMCI
 376   if (restore_wide_vectors) {
 377     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 378     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 379   }
 380 #else
 381   assert(!restore_wide_vectors, "vectors are generated only by C2");
 382 #endif
 383 
 384   __ vzeroupper();
 385 
 386   // On EVEX enabled targets everything is handled in pop fpu state
 387   if (restore_wide_vectors) {
 388     // Restore upper half of YMM registers (0..15)
 389     int base_addr = XSAVE_AREA_YMM_BEGIN;
 390     for (int n = 0; n < 16; n++) {
 391       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 392     }
 393     if (VM_Version::supports_evex()) {
 394       // Restore upper half of ZMM registers (0..15)
 395       base_addr = XSAVE_AREA_ZMM_BEGIN;
 396       for (int n = 0; n < 16; n++) {
 397         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 398       }
 399       // Restore full ZMM registers(16..num_xmm_regs)
 400       base_addr = XSAVE_AREA_UPPERBANK;
 401       int vector_len = Assembler::AVX_512bit;
 402       int off = 0;
 403       for (int n = 16; n < num_xmm_regs; n++) {
 404         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 405       }
 406 #if COMPILER2_OR_JVMCI
 407       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 408       off = 0;
 409       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 410         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 411       }
 412 #endif
 413     }
 414   } else {
 415     if (VM_Version::supports_evex()) {
 416       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 417       int base_addr = XSAVE_AREA_UPPERBANK;
 418       int off = 0;
 419       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 420       for (int n = 16; n < num_xmm_regs; n++) {
 421         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 422       }
 423 #if COMPILER2_OR_JVMCI
 424       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 425       off = 0;
 426       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 427         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 428       }
 429 #endif
 430     }
 431   }
 432 
 433   // Recover CPU state
 434   __ pop_CPU_state();
 435   // Get the rbp described implicitly by the calling convention (no oopMap)
 436   __ pop(rbp);
 437 }
 438 
 439 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 440 
 441   // Just restore the result registers. Only used by deoptimization. By
 442   // now any callee-saved register that needs to be restored to a c2
 443   // caller of the deoptee has been extracted into the vframeArray
 444   // and will be stuffed into the c2i adapter we create for later
 445   // restoration, so only the result registers need to be restored here.
 446 
 447   // Restore fp result register
 448   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 449   // Restore integer result register
 450   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 451   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 452 
 453   // Pop all of the register save area off the stack except the return address
 454   __ addptr(rsp, return_offset_in_bytes());
 455 }
 456 
 457 // Is the vector's size (in bytes) bigger than the size saved by default?
 458 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 459 bool SharedRuntime::is_wide_vector(int size) {
 460   return size > 16;
 461 }
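     // For example, a 16-byte XMM vector is covered by the default fxsave/fxrstor area,
     // while 32-byte YMM and 64-byte ZMM vectors are "wide" and need the extra
     // save/restore paths in RegisterSaver above.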
 462 
 463 // ---------------------------------------------------------------------------
 464 // Read the array of BasicTypes from a signature, and compute where the
 465 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 466 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 467 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 468 // as framesizes are fixed.
 469 // VMRegImpl::stack0 refers to the first stack slot, 0(sp),
 470 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Register
 471 // values up to RegisterImpl::number_of_registers correspond to the 64-bit
 472 // integer registers.
 473 
 474 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 475 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 476 // units regardless of build. Of course for i486 there is no 64 bit build
 477 
 478 // The Java calling convention is a "shifted" version of the C ABI.
 479 // By skipping the first C ABI register we can call non-static jni methods
 480 // with small numbers of arguments without having to shuffle the arguments
 481 // at all. Since we control the java ABI we ought to at least get some
 482 // advantage out of it.
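     // A sketch of what the loop below produces (illustrative only) for a method with
     // Java signature (int, long, Object, float, double), i.e.
     //   sig_bt = { T_INT, T_LONG, T_VOID, T_OBJECT, T_FLOAT, T_DOUBLE, T_VOID }:
     //   T_INT    -> j_rarg0 (32-bit slot)
     //   T_LONG   -> j_rarg1 (full 64 bits), the trailing T_VOID half -> BAD
     //   T_OBJECT -> j_rarg2 (full 64 bits)
     //   T_FLOAT  -> j_farg0 (32-bit slot)
     //   T_DOUBLE -> j_farg1 (full 64 bits), the trailing T_VOID half -> BAD
     // No stack slots are needed, so the (aligned) return value is 0.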
 483 
 484 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 485                                            VMRegPair *regs,
 486                                            int total_args_passed) {
 487 
 488   // Create the mapping between argument positions and
 489   // registers.
 490   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 491     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 492   };
 493   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 494     j_farg0, j_farg1, j_farg2, j_farg3,
 495     j_farg4, j_farg5, j_farg6, j_farg7
 496   };
 497 
 498 
 499   uint int_args = 0;
 500   uint fp_args = 0;
 501   uint stk_args = 0; // inc by 2 each time
 502 
 503   for (int i = 0; i < total_args_passed; i++) {
 504     switch (sig_bt[i]) {
 505     case T_BOOLEAN:
 506     case T_CHAR:
 507     case T_BYTE:
 508     case T_SHORT:
 509     case T_INT:
 510       if (int_args < Argument::n_int_register_parameters_j) {
 511         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 512       } else {
 513         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 514         stk_args += 2;
 515       }
 516       break;
 517     case T_VOID:
 518       // halves of T_LONG or T_DOUBLE
 519       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 520       regs[i].set_bad();
 521       break;
 522     case T_LONG:
 523       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 524       // fall through
 525     case T_OBJECT:
 526     case T_ARRAY:
 527     case T_ADDRESS:
 528       if (int_args < Argument::n_int_register_parameters_j) {
 529         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 530       } else {
 531         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 532         stk_args += 2;
 533       }
 534       break;
 535     case T_FLOAT:
 536       if (fp_args < Argument::n_float_register_parameters_j) {
 537         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 538       } else {
 539         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 540         stk_args += 2;
 541       }
 542       break;
 543     case T_DOUBLE:
 544       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 545       if (fp_args < Argument::n_float_register_parameters_j) {
 546         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 547       } else {
 548         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 549         stk_args += 2;
 550       }
 551       break;
 552     default:
 553       ShouldNotReachHere();
 554       break;
 555     }
 556   }
 557 
 558   return align_up(stk_args, 2);
 559 }
 560 
 561 // Patch the caller's callsite with entry to compiled code if it exists.
 562 static void patch_callers_callsite(MacroAssembler *masm) {
 563   Label L;
 564   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
 565   __ jcc(Assembler::equal, L);
 566 
 567   // Save the current stack pointer
 568   __ mov(r13, rsp);
 569   // Schedule the branch target address early.
 570   // Call into the VM to patch the caller, then jump to compiled callee
 571   // rax isn't live so capture return address while we easily can
 572   __ movptr(rax, Address(rsp, 0));
 573 
 574   // align stack so push_CPU_state doesn't fault
 575   __ andptr(rsp, -(StackAlignmentInBytes));
 576   __ push_CPU_state();
 577   __ vzeroupper();
 578   // VM needs caller's callsite
 579   // VM needs target method
 580   // This needs to be a long call since we will relocate this adapter to
 581   // the codeBuffer and it may not reach
 582 
 583   // Allocate argument register save area
 584   if (frame::arg_reg_save_area_bytes != 0) {
 585     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 586   }
 587   __ mov(c_rarg0, rbx);
 588   __ mov(c_rarg1, rax);
 589   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 590 
 591   // De-allocate argument register save area
 592   if (frame::arg_reg_save_area_bytes != 0) {
 593     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 594   }
 595 
 596   __ vzeroupper();
 597   __ pop_CPU_state();
 598   // restore sp
 599   __ mov(rsp, r13);
 600   __ bind(L);
 601 }
 602 
 603 
 604 static void gen_c2i_adapter(MacroAssembler *masm,
 605                             int total_args_passed,
 606                             int comp_args_on_stack,
 607                             const BasicType *sig_bt,
 608                             const VMRegPair *regs,
 609                             Label& skip_fixup) {
 610   // Before we get into the guts of the C2I adapter, see if we should be here
 611   // at all.  We've come from compiled code and are attempting to jump to the
 612   // interpreter, which means the caller made a static call to get here
 613   // (vcalls always get a compiled target if there is one).  Check for a
 614   // compiled target.  If there is one, we need to patch the caller's call.
 615   patch_callers_callsite(masm);
 616 
 617   __ bind(skip_fixup);
 618 
 619   // Since all args are passed on the stack, total_args_passed *
 620   // Interpreter::stackElementSize is the space we need. Plus one word because
 621   // we also account for the return address location, since
 622   // we store it first rather than hold it in rax across all the shuffling.
 623 
 624   int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
 625 
 626   // stack is aligned, keep it that way
 627   extraspace = align_up(extraspace, 2*wordSize);
 628 
 629   // Get return address
 630   __ pop(rax);
 631 
 632   // set senderSP value
 633   __ mov(r13, rsp);
 634 
 635   __ subptr(rsp, extraspace);
 636 
 637   // Store the return address in the expected location
 638   __ movptr(Address(rsp, 0), rax);
 639 
 640   // Now write the args into the outgoing interpreter space
 641   for (int i = 0; i < total_args_passed; i++) {
 642     if (sig_bt[i] == T_VOID) {
 643       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 644       continue;
 645     }
 646 
 647     // offset to start parameters
 648     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 649     int next_off = st_off - Interpreter::stackElementSize;
 650 
 651     // Say 4 args:
 652     // i   st_off
 653     // 0   32 T_LONG
 654     // 1   24 T_VOID
 655     // 2   16 T_OBJECT
 656     // 3    8 T_BOOL
 657     // -    0 return address
 658     //
 659     // However, to make things extra confusing: because we can fit a long/double in
 660     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 661     // leaves one slot empty and only stores to a single slot. In this case the
 662     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
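         // Concretely, in the 4-arg example above the T_LONG at i == 0 has st_off == 32 and
         // next_off == 24, so its 64-bit value is stored at offset 24 (the T_VOID slot) and
         // offset 32 is left unused (filled with known junk under ASSERT).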
 663 
 664     VMReg r_1 = regs[i].first();
 665     VMReg r_2 = regs[i].second();
 666     if (!r_1->is_valid()) {
 667       assert(!r_2->is_valid(), "");
 668       continue;
 669     }
 670     if (r_1->is_stack()) {
 671       // memory to memory copy, use rax as a temporary
 672       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 673       if (!r_2->is_valid()) {
 674         // sign extend??
 675         __ movl(rax, Address(rsp, ld_off));
 676         __ movptr(Address(rsp, st_off), rax);
 677 
 678       } else {
 679 
 680         __ movq(rax, Address(rsp, ld_off));
 681 
 682         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 683         // T_DOUBLE and T_LONG use two slots in the interpreter
 684         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 685           // ld_off == LSW, ld_off+wordSize == MSW
 686           // st_off == MSW, next_off == LSW
 687           __ movq(Address(rsp, next_off), rax);
 688 #ifdef ASSERT
 689           // Overwrite the unused slot with known junk
 690           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 691           __ movptr(Address(rsp, st_off), rax);
 692 #endif /* ASSERT */
 693         } else {
 694           __ movq(Address(rsp, st_off), rax);
 695         }
 696       }
 697     } else if (r_1->is_Register()) {
 698       Register r = r_1->as_Register();
 699       if (!r_2->is_valid()) {
 700         // must be only an int (or less) so move only 32 bits to the slot
 701         // why not sign extend??
 702         __ movl(Address(rsp, st_off), r);
 703       } else {
 704         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 705         // T_DOUBLE and T_LONG use two slots in the interpreter
 706         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 707           // long/double in gpr
 708 #ifdef ASSERT
 709           // Overwrite the unused slot with known junk
 710           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 711           __ movptr(Address(rsp, st_off), rax);
 712 #endif /* ASSERT */
 713           __ movq(Address(rsp, next_off), r);
 714         } else {
 715           __ movptr(Address(rsp, st_off), r);
 716         }
 717       }
 718     } else {
 719       assert(r_1->is_XMMRegister(), "");
 720       if (!r_2->is_valid()) {
 721         // only a float, use just part of the slot
 722         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 723       } else {
 724 #ifdef ASSERT
 725         // Overwrite the unused slot with known junk
 726         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 727         __ movptr(Address(rsp, st_off), rax);
 728 #endif /* ASSERT */
 729         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 730       }
 731     }
 732   }
 733 
 734   // Schedule the branch target address early.
 735   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 736   __ jmp(rcx);
 737 }
 738 
 739 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 740                         address code_start, address code_end,
 741                         Label& L_ok) {
 742   Label L_fail;
 743   __ lea(temp_reg, ExternalAddress(code_start));
 744   __ cmpptr(pc_reg, temp_reg);
 745   __ jcc(Assembler::belowEqual, L_fail);
 746   __ lea(temp_reg, ExternalAddress(code_end));
 747   __ cmpptr(pc_reg, temp_reg);
 748   __ jcc(Assembler::below, L_ok);
 749   __ bind(L_fail);
 750 }
 751 
 752 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 753                                     int total_args_passed,
 754                                     int comp_args_on_stack,
 755                                     const BasicType *sig_bt,
 756                                     const VMRegPair *regs) {
 757 
 758   // Note: r13 contains the senderSP on entry. We must preserve it since
 759   // we may do an i2c -> c2i transition if we lose a race where compiled
 760   // code goes non-entrant while we get args ready.
 761   // In addition we use r13 to locate all the interpreter args, as
 762   // we must align the stack to 16 bytes on an i2c entry, else we
 763   // lose the alignment we expect in all compiled code and the register
 764   // save code can segv when fxsave instructions find an improperly
 765   // aligned stack pointer.
 766 
 767   // Adapters can be frameless because they do not require the caller
 768   // to perform additional cleanup work, such as correcting the stack pointer.
 769   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 770   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 771   // even if a callee has modified the stack pointer.
 772   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 773   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 774   // up via the senderSP register).
 775   // In other words, if *either* the caller or callee is interpreted, we can
 776   // get the stack pointer repaired after a call.
 777   // This is why c2i and i2c adapters cannot be indefinitely composed.
 778   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 779   // both caller and callee would be compiled methods, and neither would
 780   // clean up the stack pointer changes performed by the two adapters.
 781   // If this happens, control eventually transfers back to the compiled
 782   // caller, but with an uncorrected stack, causing delayed havoc.
 783 
 784   // Pick up the return address
 785   __ movptr(rax, Address(rsp, 0));
 786 
 787   if (VerifyAdapterCalls &&
 788       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
 789     // So, let's test for cascading c2i/i2c adapters right now.
 790     //  assert(Interpreter::contains($return_addr) ||
 791     //         StubRoutines::contains($return_addr),
 792     //         "i2c adapter must return to an interpreter frame");
 793     __ block_comment("verify_i2c { ");
 794     Label L_ok;
 795     if (Interpreter::code() != NULL)
 796       range_check(masm, rax, r11,
 797                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
 798                   L_ok);
 799     if (StubRoutines::code1() != NULL)
 800       range_check(masm, rax, r11,
 801                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
 802                   L_ok);
 803     if (StubRoutines::code2() != NULL)
 804       range_check(masm, rax, r11,
 805                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
 806                   L_ok);
 807     const char* msg = "i2c adapter must return to an interpreter frame";
 808     __ block_comment(msg);
 809     __ stop(msg);
 810     __ bind(L_ok);
 811     __ block_comment("} verify_i2c ");
 812   }
 813 
 814   // Must preserve original SP for loading incoming arguments because
 815   // we need to align the outgoing SP for compiled code.
 816   __ movptr(r11, rsp);
 817 
 818   // Cut-out for having no stack args.  Since up to 6 int/oop args are passed
 819   // in registers, we will occasionally have no stack args.
 820   int comp_words_on_stack = 0;
 821   if (comp_args_on_stack) {
 822     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
 823     // registers are below.  By subtracting stack0, we either get a negative
 824     // number (all values in registers) or the maximum stack slot accessed.
 825 
 826     // Convert 4-byte c2 stack slots to words.
 827     comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 828     // Round up to minimum stack alignment, in wordSize
 829     comp_words_on_stack = align_up(comp_words_on_stack, 2);
 830     __ subptr(rsp, comp_words_on_stack * wordSize);
 831   }
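       // Worked example for the block above (illustrative only): comp_args_on_stack == 5
       // slots -> 5 * 4 = 20 bytes -> rounded up to 24 bytes == 3 words -> aligned up to
       // 4 words, so rsp is dropped by 32 bytes.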
 832 
 833 
 834   // Ensure compiled code always sees stack at proper alignment
 835   __ andptr(rsp, -16);
 836 
 837   // Push the return address and misalign the stack so that the youngest frame
 838   // sees it exactly as it would immediately after a call instruction.
 839   __ push(rax);
 840 
 841   // Put saved SP in another register
 842   const Register saved_sp = rax;
 843   __ movptr(saved_sp, r11);
 844 
 845   // Will jump to the compiled code just as if compiled code was doing it.
 846   // Pre-load the register-jump target early, to schedule it better.
 847   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 848 
 849 #if INCLUDE_JVMCI
 850   if (EnableJVMCI) {
 851     // check if this call should be routed towards a specific entry point
 852     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 853     Label no_alternative_target;
 854     __ jcc(Assembler::equal, no_alternative_target);
 855     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 856     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 857     __ bind(no_alternative_target);
 858   }
 859 #endif // INCLUDE_JVMCI
 860 
 861   // Now generate the shuffle code.  Pick up all register args and move the
 862   // rest through the floating point stack top.
 863   for (int i = 0; i < total_args_passed; i++) {
 864     if (sig_bt[i] == T_VOID) {
 865       // Longs and doubles are passed in native word order, but misaligned
 866       // in the 32-bit build.
 867       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 868       continue;
 869     }
 870 
 871     // Pick up 0, 1 or 2 words from SP+offset.
 872 
 873     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 874             "scrambled load targets?");
 875     // Load in argument order going down.
 876     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 877     // Point to interpreter value (vs. tag)
 878     int next_off = ld_off - Interpreter::stackElementSize;
 879     //
 880     //
 881     //
 882     VMReg r_1 = regs[i].first();
 883     VMReg r_2 = regs[i].second();
 884     if (!r_1->is_valid()) {
 885       assert(!r_2->is_valid(), "");
 886       continue;
 887     }
 888     if (r_1->is_stack()) {
 889       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 890       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 891 
 892       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 893       // and if we end up going through a c2i because of a miss, a reasonable value of r13
 894       // will be generated.
 895       if (!r_2->is_valid()) {
 896         // sign extend???
 897         __ movl(r13, Address(saved_sp, ld_off));
 898         __ movptr(Address(rsp, st_off), r13);
 899       } else {
 900         //
 901         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
 902         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
 903         // So we must adjust where to pick up the data to match the interpreter.
 904         //
 905         // Interpreter local[n] == MSW, local[n+1] == LSW, however locals
 906         // are addressed with negative offsets so the LSW is at the lower address.
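             // e.g. for a T_LONG argument, ld_off names the higher-addressed (tag/MSW) slot
             // and next_off the lower-addressed slot that actually holds the 64-bit value,
             // so the load below uses next_off.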
 907 
 908         // ld_off is MSW so get LSW
 909         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 910                            next_off : ld_off;
 911         __ movq(r13, Address(saved_sp, offset));
 912         // st_off is LSW (i.e. reg.first())
 913         __ movq(Address(rsp, st_off), r13);
 914       }
 915     } else if (r_1->is_Register()) {  // Register argument
 916       Register r = r_1->as_Register();
 917       assert(r != rax, "must be different");
 918       if (r_2->is_valid()) {
 919         //
 920         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 921         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
 922         // So we must adjust where to pick up the data to match the interpreter.
 923 
 924         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 925                            next_off : ld_off;
 926 
 927         // this can be a misaligned move
 928         __ movq(r, Address(saved_sp, offset));
 929       } else {
 930         // sign extend and use a full word?
 931         __ movl(r, Address(saved_sp, ld_off));
 932       }
 933     } else {
 934       if (!r_2->is_valid()) {
 935         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 936       } else {
 937         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 938       }
 939     }
 940   }
 941 
 942   // 6243940 We might end up in handle_wrong_method if
 943   // the callee is deoptimized as we race through here. If that
 944   // happens we don't want to take a safepoint because the
 945   // caller frame will look interpreted and arguments are now
 946   // "compiled" so it is much better to make this transition
 947   // invisible to the stack walking code. Unfortunately if
 948   // we try to find the callee by normal means a safepoint
 949   // is possible. So we stash the desired callee in the thread
 950   // and the VM will find it there should this case occur.
 951 
 952   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 953 
 954   // put Method* where a c2i would expect it should we end up there;
 955   // only needed because c2 resolve stubs return Method* as a result in
 956   // rax
 957   __ mov(rax, rbx);
 958   __ jmp(r11);
 959 }
 960 
 961 // ---------------------------------------------------------------
 962 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 963                                                             int total_args_passed,
 964                                                             int comp_args_on_stack,
 965                                                             const BasicType *sig_bt,
 966                                                             const VMRegPair *regs,
 967                                                             AdapterFingerPrint* fingerprint) {
 968   address i2c_entry = __ pc();
 969 
 970   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 971 
 972   // -------------------------------------------------------------------------
 973   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 974   // to the interpreter.  The args start out packed in the compiled layout.  They
 975   // need to be unpacked into the interpreter layout.  This will almost always
 976   // require some stack space.  We grow the current (compiled) stack, then repack
 977   // the args.  We  finally end in a jump to the generic interpreter entry point.
 978   // On exit from the interpreter, the interpreter will restore our SP (lest the
 979   // compiled code, which relies solely on SP and not RBP, get sick).
 980 
 981   address c2i_unverified_entry = __ pc();
 982   Label skip_fixup;
 983   Label ok;
 984 
 985   Register holder = rax;
 986   Register receiver = j_rarg0;
 987   Register temp = rbx;
 988 
 989   {
 990     __ load_klass(temp, receiver, rscratch1);
 991     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
 992     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
 993     __ jcc(Assembler::equal, ok);
 994     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 995 
 996     __ bind(ok);
 997     // Method might have been compiled since the call site was patched to
 998     // interpreted; if that is the case treat it as a miss so we can get
 999     // the call site corrected.
1000     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
1001     __ jcc(Assembler::equal, skip_fixup);
1002     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1003   }
1004 
1005   address c2i_entry = __ pc();
1006 
1007   // Class initialization barrier for static methods
1008   address c2i_no_clinit_check_entry = NULL;
1009   if (VM_Version::supports_fast_class_init_checks()) {
1010     Label L_skip_barrier;
1011     Register method = rbx;
1012 
1013     { // Bypass the barrier for non-static methods
1014       Register flags  = rscratch1;
1015       __ movl(flags, Address(method, Method::access_flags_offset()));
1016       __ testl(flags, JVM_ACC_STATIC);
1017       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1018     }
1019 
1020     Register klass = rscratch1;
1021     __ load_method_holder(klass, method);
1022     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1023 
1024     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1025 
1026     __ bind(L_skip_barrier);
1027     c2i_no_clinit_check_entry = __ pc();
1028   }
1029 
1030   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1031   bs->c2i_entry_barrier(masm);
1032 
1033   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1034 
1035   __ flush();
1036   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1037 }
1038 
1039 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1040                                          VMRegPair *regs,
1041                                          VMRegPair *regs2,
1042                                          int total_args_passed) {
1043   assert(regs2 == NULL, "not needed on x86");
1044 // We return the number of VMRegImpl stack slots we need to reserve for all
1045 // the arguments NOT counting out_preserve_stack_slots.
1046 
1047 // NOTE: These arrays will have to change when c1 is ported
1048 #ifdef _WIN64
1049     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1050       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1051     };
1052     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1053       c_farg0, c_farg1, c_farg2, c_farg3
1054     };
1055 #else
1056     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1057       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1058     };
1059     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1060       c_farg0, c_farg1, c_farg2, c_farg3,
1061       c_farg4, c_farg5, c_farg6, c_farg7
1062     };
1063 #endif // _WIN64
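         // A sketch of what the loop below computes (illustrative only) for a native
         // signature (jint, jlong, jfloat, jdouble), ignoring the T_VOID halves that
         // follow long and double:
         //   System V (Linux):  c_rarg0, c_rarg1, c_farg0, c_farg1; stk_args stays 0.
         //   Windows x64:       c_rarg0, c_rarg1, c_farg2, c_farg3 (integer and FP args
         //                      share positional slots) and stk_args ends up as 8, i.e.
         //                      the 32-byte shadow area.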
1064 
1065 
1066     uint int_args = 0;
1067     uint fp_args = 0;
1068     uint stk_args = 0; // inc by 2 each time
1069 
1070     for (int i = 0; i < total_args_passed; i++) {
1071       switch (sig_bt[i]) {
1072       case T_BOOLEAN:
1073       case T_CHAR:
1074       case T_BYTE:
1075       case T_SHORT:
1076       case T_INT:
1077         if (int_args < Argument::n_int_register_parameters_c) {
1078           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1079 #ifdef _WIN64
1080           fp_args++;
1081           // Allocate slots for the callee to stuff register args on the stack.
1082           stk_args += 2;
1083 #endif
1084         } else {
1085           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1086           stk_args += 2;
1087         }
1088         break;
1089       case T_LONG:
1090         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1091         // fall through
1092       case T_OBJECT:
1093       case T_ARRAY:
1094       case T_ADDRESS:
1095       case T_METADATA:
1096         if (int_args < Argument::n_int_register_parameters_c) {
1097           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1098 #ifdef _WIN64
1099           fp_args++;
1100           stk_args += 2;
1101 #endif
1102         } else {
1103           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1104           stk_args += 2;
1105         }
1106         break;
1107       case T_FLOAT:
1108         if (fp_args < Argument::n_float_register_parameters_c) {
1109           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1110 #ifdef _WIN64
1111           int_args++;
1112           // Allocate slots for the callee to stuff register args on the stack.
1113           stk_args += 2;
1114 #endif
1115         } else {
1116           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1117           stk_args += 2;
1118         }
1119         break;
1120       case T_DOUBLE:
1121         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1122         if (fp_args < Argument::n_float_register_parameters_c) {
1123           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1124 #ifdef _WIN64
1125           int_args++;
1126           // Allocate slots for the callee to stuff register args on the stack.
1127           stk_args += 2;
1128 #endif
1129         } else {
1130           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1131           stk_args += 2;
1132         }
1133         break;
1134       case T_VOID: // Halves of longs and doubles
1135         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1136         regs[i].set_bad();
1137         break;
1138       default:
1139         ShouldNotReachHere();
1140         break;
1141       }
1142     }
1143 #ifdef _WIN64
1144   // The Windows ABI requires that we always allocate enough stack space for the
1145   // callee to store its 4 register args (32 bytes); stk_args is in 4-byte slots, hence 8.
1146   if (stk_args < 8) {
1147     stk_args = 8;
1148   }
1149 #endif // _WIN64
1150 
1151   return stk_args;
1152 }
1153 
1154 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1155                                              uint num_bits,
1156                                              uint total_args_passed) {
1157   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1158          "only certain vector sizes are supported for now");
1159 
1160   static const XMMRegister VEC_ArgReg[32] = {
1161      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1162      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1163     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1164     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1165   };
1166 
1167   uint stk_args = 0;
1168   uint fp_args = 0;
1169 
1170   for (uint i = 0; i < total_args_passed; i++) {
1171     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1172     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1173     regs[i].set_pair(vmreg->next(next_val), vmreg);
1174   }
1175 
1176   return stk_args;
1177 }
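     // For example (illustrative only), with num_bits == 256 and total_args_passed == 2 the
     // loop above yields regs[0] spanning xmm0->as_VMReg()..->next(7) and regs[1] spanning
     // xmm1->as_VMReg()..->next(7); stk_args stays 0, so no stack slots are used.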
1178 
1179 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1180   // We always ignore the frame_slots arg and just use the space just below the
1181   // frame pointer, which by this time is free to use.
1182   switch (ret_type) {
1183   case T_FLOAT:
1184     __ movflt(Address(rbp, -wordSize), xmm0);
1185     break;
1186   case T_DOUBLE:
1187     __ movdbl(Address(rbp, -wordSize), xmm0);
1188     break;
1189   case T_VOID:  break;
1190   default: {
1191     __ movptr(Address(rbp, -wordSize), rax);
1192     }
1193   }
1194 }
1195 
1196 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1197   // We always ignore the frame_slots arg and just use the space just below the
1198   // frame pointer, which by this time is free to use.
1199   switch (ret_type) {
1200   case T_FLOAT:
1201     __ movflt(xmm0, Address(rbp, -wordSize));
1202     break;
1203   case T_DOUBLE:
1204     __ movdbl(xmm0, Address(rbp, -wordSize));
1205     break;
1206   case T_VOID:  break;
1207   default: {
1208     __ movptr(rax, Address(rbp, -wordSize));
1209     }
1210   }
1211 }
1212 
1213 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1214     for ( int i = first_arg ; i < arg_count ; i++ ) {
1215       if (args[i].first()->is_Register()) {
1216         __ push(args[i].first()->as_Register());
1217       } else if (args[i].first()->is_XMMRegister()) {
1218         __ subptr(rsp, 2*wordSize);
1219         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1220       }
1221     }
1222 }
1223 
1224 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1225     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1226       if (args[i].first()->is_Register()) {
1227         __ pop(args[i].first()->as_Register());
1228       } else if (args[i].first()->is_XMMRegister()) {
1229         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1230         __ addptr(rsp, 2*wordSize);
1231       }
1232     }
1233 }
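     // Note that restore_args walks the arguments in reverse order so that its pops and
     // loads exactly mirror the pushes and stores done by save_args above.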
1234 
1235 // Unpack an array argument into a pointer to the body and the length
1236 // if the array is non-null, otherwise pass 0 for both.
1237 static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
1238   Register tmp_reg = rax;
1239   assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
1240          "possible collision");
1241   assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
1242          "possible collision");
1243 
1244   __ block_comment("unpack_array_argument {");
1245 
1246   // Pass the length, ptr pair
1247   Label is_null, done;
1248   VMRegPair tmp;
1249   tmp.set_ptr(tmp_reg->as_VMReg());
1250   if (reg.first()->is_stack()) {
1251     // Load the arg up from the stack
1252     __ move_ptr(reg, tmp);
1253     reg = tmp;
1254   }
1255   __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
1256   __ jccb(Assembler::equal, is_null);
1257   __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1258   __ move_ptr(tmp, body_arg);
1259   // load the length relative to the body.
1260   __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
1261                            arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1262   __ move32_64(tmp, length_arg);
1263   __ jmpb(done);
1264   __ bind(is_null);
1265   // Pass zeros
1266   __ xorptr(tmp_reg, tmp_reg);
1267   __ move_ptr(tmp, body_arg);
1268   __ move32_64(tmp, length_arg);
1269   __ bind(done);
1270 
1271   __ block_comment("} unpack_array_argument");
1272 }
1273 
1274 
1275 // Different signatures may require very different orders for the move
1276 // to avoid clobbering other arguments.  There's no simple way to
1277 // order them safely.  Compute a safe order for issuing stores and
1278 // break any cycles in those stores.  This code is fairly general but
1279 // it's not necessary on the other platforms so we keep it in the
1280 // platform dependent code instead of moving it into a shared file.
1281 // (See bugs 7013347 & 7145024.)
1282 // Note that this code is specific to LP64.
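     // For example (illustrative only): if the required moves are rdi -> rsi and rsi -> rdi,
     // neither store can safely go first. break_cycle() below redirects one of them through
     // the caller-supplied temp, e.g. rdi -> tmp, rsi -> rdi, tmp -> rsi, which can then be
     // emitted in order.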
1283 class ComputeMoveOrder: public StackObj {
1284   class MoveOperation: public ResourceObj {
1285     friend class ComputeMoveOrder;
1286    private:
1287     VMRegPair        _src;
1288     VMRegPair        _dst;
1289     int              _src_index;
1290     int              _dst_index;
1291     bool             _processed;
1292     MoveOperation*  _next;
1293     MoveOperation*  _prev;
1294 
1295     static int get_id(VMRegPair r) {
1296       return r.first()->value();
1297     }
1298 
1299    public:
1300     MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1301       _src(src)
1302     , _dst(dst)
1303     , _src_index(src_index)
1304     , _dst_index(dst_index)
1305     , _processed(false)
1306     , _next(NULL)
1307     , _prev(NULL) {
1308     }
1309 
1310     VMRegPair src() const              { return _src; }
1311     int src_id() const                 { return get_id(src()); }
1312     int src_index() const              { return _src_index; }
1313     VMRegPair dst() const              { return _dst; }
1314     void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
1315     int dst_index() const              { return _dst_index; }
1316     int dst_id() const                 { return get_id(dst()); }
1317     MoveOperation* next() const       { return _next; }
1318     MoveOperation* prev() const       { return _prev; }
1319     void set_processed()               { _processed = true; }
1320     bool is_processed() const          { return _processed; }
1321 
1322     // Insert a new store through temp_register to break a cycle.
1323     void break_cycle(VMRegPair temp_register) {
1324       // create a new store following the last store
1325       // to move from the temp_register to the original
1326       MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1327 
1328       // break the cycle of links and insert new_store at the end
1329       // break the reverse link.
1330       MoveOperation* p = prev();
1331       assert(p->next() == this, "must be");
1332       _prev = NULL;
1333       p->_next = new_store;
1334       new_store->_prev = p;
1335 
1336       // change the original store to save its value in the temp.
1337       set_dst(-1, temp_register);
1338     }
1339 
1340     void link(GrowableArray<MoveOperation*>& killer) {
1341       // link this store in front of the store that it depends on
1342       MoveOperation* n = killer.at_grow(src_id(), NULL);
1343       if (n != NULL) {
1344         assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1345         _next = n;
1346         n->_prev = this;
1347       }
1348     }
1349   };
1350 
1351  private:
1352   GrowableArray<MoveOperation*> edges;
1353 
1354  public:
1355   ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1356                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1357     // Move operations where the dest is the stack can all be
1358     // scheduled first since they can't interfere with the other moves.
1359     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1360       if (in_sig_bt[i] == T_ARRAY) {
1361         c_arg--;
1362         if (out_regs[c_arg].first()->is_stack() &&
1363             out_regs[c_arg + 1].first()->is_stack()) {
1364           arg_order.push(i);
1365           arg_order.push(c_arg);
1366         } else {
1367           if (out_regs[c_arg].first()->is_stack() ||
1368               in_regs[i].first() == out_regs[c_arg].first()) {
1369             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1370           } else {
1371             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1372           }
1373         }
1374       } else if (in_sig_bt[i] == T_VOID) {
1375         arg_order.push(i);
1376         arg_order.push(c_arg);
1377       } else {
1378         if (out_regs[c_arg].first()->is_stack() ||
1379             in_regs[i].first() == out_regs[c_arg].first()) {
1380           arg_order.push(i);
1381           arg_order.push(c_arg);
1382         } else {
1383           add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1384         }
1385       }
1386     }
1387     // Break any cycles in the register moves and emit them in the
1388     // proper order.
1389     GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1390     for (int i = 0; i < stores->length(); i++) {
1391       arg_order.push(stores->at(i)->src_index());
1392       arg_order.push(stores->at(i)->dst_index());
1393     }
1394  }
1395 
1396   // Collect all the move operations
1397   void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1398     if (src.first() == dst.first()) return;
1399     edges.append(new MoveOperation(src_index, src, dst_index, dst));
1400   }
1401 
1402   // Walk the edges breaking cycles between moves.  The result list
1403   // can be walked in order to produce the proper set of stores
1404   GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1405     // Record which moves kill which values
1406     GrowableArray<MoveOperation*> killer;
1407     for (int i = 0; i < edges.length(); i++) {
1408       MoveOperation* s = edges.at(i);
1409       assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1410       killer.at_put_grow(s->dst_id(), s, NULL);
1411     }
1412     assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1413            "make sure temp isn't in the registers that are killed");
1414 
1415     // create links between loads and stores
1416     for (int i = 0; i < edges.length(); i++) {
1417       edges.at(i)->link(killer);
1418     }
1419 
1420     // at this point, all the move operations are chained together
1421     // in a doubly linked list.  Processing it backwards finds
1422     // the beginning of the chain, forwards finds the end.  If there's
1423     // a cycle it can be broken at any point,  so pick an edge and walk
1424     // backward until the list ends or we end where we started.
1425     GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1426     for (int e = 0; e < edges.length(); e++) {
1427       MoveOperation* s = edges.at(e);
1428       if (!s->is_processed()) {
1429         MoveOperation* start = s;
1430         // search for the beginning of the chain or cycle
1431         while (start->prev() != NULL && start->prev() != s) {
1432           start = start->prev();
1433         }
1434         if (start->prev() == s) {
1435           start->break_cycle(temp_register);
1436         }
1437         // walk the chain forward inserting to store list
1438         while (start != NULL) {
1439           stores->append(start);
1440           start->set_processed();
1441           start = start->next();
1442         }
1443       }
1444     }
1445     return stores;
1446   }
1447 };
1448 
1449 static void verify_oop_args(MacroAssembler* masm,
1450                             const methodHandle& method,
1451                             const BasicType* sig_bt,
1452                             const VMRegPair* regs) {
1453   Register temp_reg = rbx;  // not part of any compiled calling seq
1454   if (VerifyOops) {
1455     for (int i = 0; i < method->size_of_parameters(); i++) {
1456       if (is_reference_type(sig_bt[i])) {
1457         VMReg r = regs[i].first();
1458         assert(r->is_valid(), "bad oop arg");
1459         if (r->is_stack()) {
1460           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1461           __ verify_oop(temp_reg);
1462         } else {
1463           __ verify_oop(r->as_Register());
1464         }
1465       }
1466     }
1467   }
1468 }
1469 
1470 static void gen_special_dispatch(MacroAssembler* masm,
1471                                  const methodHandle& method,
1472                                  const BasicType* sig_bt,
1473                                  const VMRegPair* regs) {
1474   verify_oop_args(masm, method, sig_bt, regs);
1475   vmIntrinsics::ID iid = method->intrinsic_id();
1476 
1477   // Now write the args into the outgoing interpreter space
1478   bool     has_receiver   = false;
1479   Register receiver_reg   = noreg;
1480   int      member_arg_pos = -1;
1481   Register member_reg     = noreg;
1482   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1483   if (ref_kind != 0) {
1484     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1485     member_reg = rbx;  // known to be free at this point
1486     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1487   } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1488     has_receiver = true;
1489   } else {
1490     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1491   }
1492 
1493   if (member_reg != noreg) {
1494     // Load the member_arg into register, if necessary.
1495     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1496     VMReg r = regs[member_arg_pos].first();
1497     if (r->is_stack()) {
1498       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1499     } else {
1500       // no data motion is needed
1501       member_reg = r->as_Register();
1502     }
1503   }
1504 
1505   if (has_receiver) {
1506     // Make sure the receiver is loaded into a register.
1507     assert(method->size_of_parameters() > 0, "oob");
1508     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1509     VMReg r = regs[0].first();
1510     assert(r->is_valid(), "bad receiver arg");
1511     if (r->is_stack()) {
1512       // Porting note:  This assumes that compiled calling conventions always
1513       // pass the receiver oop in a register.  If this is not true on some
1514       // platform, pick a temp and load the receiver from stack.
1515       fatal("receiver always in a register");
1516       receiver_reg = j_rarg0;  // known to be free at this point
1517       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1518     } else {
1519       // no data motion is needed
1520       receiver_reg = r->as_Register();
1521     }
1522   }
1523 
1524   // Figure out which address we are really jumping to:
1525   MethodHandles::generate_method_handle_dispatch(masm, iid,
1526                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1527 }
1528 
1529 // ---------------------------------------------------------------------------
1530 // Generate a native wrapper for a given method.  The method takes arguments
1531 // in the Java compiled code convention, marshals them to the native
1532 // convention (handlizes oops, etc), transitions to native, makes the call,
1533 // returns to java state (possibly blocking), unhandlizes any result and
1534 // returns.
1535 //
1536 // Critical native functions are a shorthand for the use of
1537 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1538 // functions.  The wrapper is expected to unpack the arguments before
1539 // passing them to the callee. Critical native functions leave the state _in_Java,
1540 // since they cannot stop for GC.
1541 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1542 // block and the check for pending exceptions, since it's impossible for them
1543 // to be thrown.
1544 //
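     // For illustration, a hypothetical critical native taking a Java byte[]
     // receives it as a (jint length, jbyte* body) pair instead of a jarray
     // handle, and gets no JNIEnv* or class-mirror argument; the unpacking
     // itself is done by unpack_array_argument() above.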
1545 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1546                                                 const methodHandle& method,
1547                                                 int compile_id,
1548                                                 BasicType* in_sig_bt,
1549                                                 VMRegPair* in_regs,
1550                                                 BasicType ret_type,
1551                                                 address critical_entry) {
1552   if (method->is_method_handle_intrinsic()) {
1553     vmIntrinsics::ID iid = method->intrinsic_id();
1554     intptr_t start = (intptr_t)__ pc();
1555     int vep_offset = ((intptr_t)__ pc()) - start;
1556     gen_special_dispatch(masm,
1557                          method,
1558                          in_sig_bt,
1559                          in_regs);
1560     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1561     __ flush();
1562     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1563     return nmethod::new_native_nmethod(method,
1564                                        compile_id,
1565                                        masm->code(),
1566                                        vep_offset,
1567                                        frame_complete,
1568                                        stack_slots / VMRegImpl::slots_per_word,
1569                                        in_ByteSize(-1),
1570                                        in_ByteSize(-1),
1571                                        (OopMapSet*)NULL);
1572   }
1573   bool is_critical_native = true;
1574   address native_func = critical_entry;
1575   if (native_func == NULL) {
1576     native_func = method->native_function();
1577     is_critical_native = false;
1578   }
1579   assert(native_func != NULL, "must have function");
1580 
1581   // An OopMap for lock (and class if static)
1582   OopMapSet *oop_maps = new OopMapSet();
1583   intptr_t start = (intptr_t)__ pc();
1584 
1585   // We have received a description of where all the java args are located
1586   // on entry to the wrapper. We need to convert these args to where
1587   // the jni function will expect them. To figure out where they go
1588   // we convert the java signature to a C signature by inserting
1589   // the hidden arguments as arg[0] and possibly arg[1] (static method)
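       // For example, for a hypothetical static native method taking (int, long)
       // the native function ends up being called as f(JNIEnv*, jclass, jint, jlong):
       // a T_ADDRESS and a T_OBJECT are prepended to the Java argument types below.
       // A non-static method gets only the JNIEnv* prepended and keeps its receiver
       // in place as the first of the Java arguments.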
1590 
1591   const int total_in_args = method->size_of_parameters();
1592   int total_c_args = total_in_args;
1593   if (!is_critical_native) {
1594     total_c_args += 1;
1595     if (method->is_static()) {
1596       total_c_args++;
1597     }
1598   } else {
1599     for (int i = 0; i < total_in_args; i++) {
1600       if (in_sig_bt[i] == T_ARRAY) {
1601         total_c_args++;
1602       }
1603     }
1604   }
1605 
1606   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1607   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1608   BasicType* in_elem_bt = NULL;
1609 
1610   int argc = 0;
1611   if (!is_critical_native) {
1612     out_sig_bt[argc++] = T_ADDRESS;
1613     if (method->is_static()) {
1614       out_sig_bt[argc++] = T_OBJECT;
1615     }
1616 
1617     for (int i = 0; i < total_in_args ; i++ ) {
1618       out_sig_bt[argc++] = in_sig_bt[i];
1619     }
1620   } else {
1621     in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1622     SignatureStream ss(method->signature());
1623     for (int i = 0; i < total_in_args ; i++ ) {
1624       if (in_sig_bt[i] == T_ARRAY) {
1625         // Arrays are passed as int, elem* pair
1626         out_sig_bt[argc++] = T_INT;
1627         out_sig_bt[argc++] = T_ADDRESS;
1628         ss.skip_array_prefix(1);  // skip one '['
1629         assert(ss.is_primitive(), "primitive type expected");
1630         in_elem_bt[i] = ss.type();
1631       } else {
1632         out_sig_bt[argc++] = in_sig_bt[i];
1633         in_elem_bt[i] = T_VOID;
1634       }
1635       if (in_sig_bt[i] != T_VOID) {
1636         assert(in_sig_bt[i] == ss.type() ||
1637                in_sig_bt[i] == T_ARRAY, "must match");
1638         ss.next();
1639       }
1640     }
1641   }
1642 
1643   // Now figure out where the args must be stored and how much stack space
1644   // they require.
1645   int out_arg_slots;
1646   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1647 
1648   // Compute framesize for the wrapper.  We need to handlize all oops in
1649   // incoming registers
1650 
1651   // Calculate the total number of stack slots we will need.
1652 
1653   // First count the abi requirement plus all of the outgoing args
1654   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1655 
1656   // Now the space for the inbound oop handle area
1657   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1658   if (is_critical_native) {
1659     // Critical natives may have to call out so they need a save area
1660     // for register arguments.
1661     int double_slots = 0;
1662     int single_slots = 0;
1663     for ( int i = 0; i < total_in_args; i++) {
1664       if (in_regs[i].first()->is_Register()) {
1665         const Register reg = in_regs[i].first()->as_Register();
1666         switch (in_sig_bt[i]) {
1667           case T_BOOLEAN:
1668           case T_BYTE:
1669           case T_SHORT:
1670           case T_CHAR:
1671           case T_INT:  single_slots++; break;
1672           case T_ARRAY:  // specific to LP64 (7145024)
1673           case T_LONG: double_slots++; break;
1674           default:  ShouldNotReachHere();
1675         }
1676       } else if (in_regs[i].first()->is_XMMRegister()) {
1677         switch (in_sig_bt[i]) {
1678           case T_FLOAT:  single_slots++; break;
1679           case T_DOUBLE: double_slots++; break;
1680           default:  ShouldNotReachHere();
1681         }
1682       } else if (in_regs[i].first()->is_FloatRegister()) {
1683         ShouldNotReachHere();
1684       }
1685     }
1686     total_save_slots = double_slots * 2 + single_slots;
1687     // align the save area
1688     if (double_slots != 0) {
1689       stack_slots = align_up(stack_slots, 2);
1690     }
1691   }
1692 
1693   int oop_handle_offset = stack_slots;
1694   stack_slots += total_save_slots;
1695 
1696   // Now any space we need for handlizing a klass if static method
1697 
1698   int klass_slot_offset = 0;
1699   int klass_offset = -1;
1700   int lock_slot_offset = 0;
1701   bool is_static = false;
1702 
1703   if (method->is_static()) {
1704     klass_slot_offset = stack_slots;
1705     stack_slots += VMRegImpl::slots_per_word;
1706     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1707     is_static = true;
1708   }
1709 
1710   // Plus a lock if needed
1711 
1712   if (method->is_synchronized()) {
1713     lock_slot_offset = stack_slots;
1714     stack_slots += VMRegImpl::slots_per_word;
1715   }
1716 
1717   // Now a place (+2) to save return values or temp during shuffling
1718   // + 4 for return address (which we own) and saved rbp
1719   stack_slots += 6;
1720 
1721   // OK, the space we have allocated will look like:
1722   //
1723   //
1724   // FP-> |                     |
1725   //      |---------------------|
1726   //      | 2 slots for moves   |
1727   //      |---------------------|
1728   //      | lock box (if sync)  |
1729   //      |---------------------| <- lock_slot_offset
1730   //      | klass (if static)   |
1731   //      |---------------------| <- klass_slot_offset
1732   //      | oopHandle area      |
1733   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1734   //      | outbound memory     |
1735   //      | based arguments     |
1736   //      |                     |
1737   //      |---------------------|
1738   //      |                     |
1739   // SP-> | out_preserved_slots |
1740   //
1741   //
1742 
1743 
1744   // Now compute the actual number of stack words we need, rounding to keep the
1745   // stack properly aligned.
1746   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1747 
1748   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1749 
1750   // First thing make an ic check to see if we should even be here
1751 
1752   // We are free to use all registers as temps without saving them and
1753   // restoring them except rbp. rbp is the only callee save register
1754   // as far as the interpreter and the compiler(s) are concerned.
1755 
1756 
1757   const Register ic_reg = rax;
1758   const Register receiver = j_rarg0;
1759 
1760   Label hit;
1761   Label exception_pending;
1762 
1763   assert_different_registers(ic_reg, receiver, rscratch1);
1764   __ verify_oop(receiver);
1765   __ load_klass(rscratch1, receiver, rscratch2);
1766   __ cmpq(ic_reg, rscratch1);
1767   __ jcc(Assembler::equal, hit);
1768 
1769   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1770 
1771   // Verified entry point must be aligned
1772   __ align(8);
1773 
1774   __ bind(hit);
1775 
1776   int vep_offset = ((intptr_t)__ pc()) - start;
1777 
1778   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1779     Label L_skip_barrier;
1780     Register klass = r10;
1781     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1782     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1783 
1784     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1785 
1786     __ bind(L_skip_barrier);
1787   }
1788 
1789 #ifdef COMPILER1
1790   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1791   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1792     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1793   }
1794 #endif // COMPILER1
1795 
1796   // The instruction at the verified entry point must be 5 bytes or longer
1797   // because it can be patched on the fly by make_non_entrant. The stack bang
1798   // instruction fits that requirement.
1799 
1800   // Generate stack overflow check
1801   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1802 
1803   // Generate a new frame for the wrapper.
1804   __ enter();
1805   // -2 because return address is already present and so is saved rbp
1806   __ subptr(rsp, stack_size - 2*wordSize);
1807 
1808   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1809   bs->nmethod_entry_barrier(masm);
1810 
1811   // Frame is now completed as far as size and linkage.
1812   int frame_complete = ((intptr_t)__ pc()) - start;
1813 
1814   if (UseRTMLocking) {
1815     // Abort RTM transaction before calling JNI
1816     // because critical section will be large and will be
1817     // aborted anyway. Also nmethod could be deoptimized.
1818     __ xabort(0);
1819   }
1820 
1821 #ifdef ASSERT
1822   {
1823     Label L;
1824     __ mov(rax, rsp);
1825     __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1826     __ cmpptr(rax, rsp);
1827     __ jcc(Assembler::equal, L);
1828     __ stop("improperly aligned stack");
1829     __ bind(L);
1830   }
1831 #endif /* ASSERT */
1832 
1833 
1834   // We use r14 as the oop handle for the receiver/klass
1835   // It is callee save so it survives the call to native
1836 
1837   const Register oop_handle_reg = r14;
1838 
1839   //
1840   // We immediately shuffle the arguments so that any vm call we have to
1841   // make from here on out (sync slow path, jvmti, etc.) we will have
1842   // captured the oops from our caller and have a valid oopMap for
1843   // them.
1844 
1845   // -----------------
1846   // The Grand Shuffle
1847 
1848   // The Java calling convention is either equal (linux) or denser (win64) than the
1849   // c calling convention. However, because of the jni_env argument the c calling
1850   // convention always has at least one more (and two for static) arguments than Java.
1851   // Therefore if we move the args from java -> c backwards then we will never have
1852   // a register->register conflict and we don't have to build a dependency graph
1853   // and figure out how to break any cycles.
1854   //
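       // Illustration (assuming the SysV/Linux register assignment, where
       // j_rarg0 == rsi, j_rarg1 == rdx and c_rarg2 == rdx): for a static method
       // taking two ints, Java arg 0 must move from rsi into c_rarg2 == rdx,
       // which is exactly where Java arg 1 starts out; because the loop below
       // visits arg 1 before arg 0, rdx has already been vacated by the time it
       // is overwritten.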
1855 
1856   // Record esp-based slot for receiver on stack for non-static methods
1857   int receiver_offset = -1;
1858 
1859   // This is a trick. We double the stack slots so we can claim
1860   // the oops in the caller's frame. Since we are sure to have
1861   // more args than the caller, doubling is enough to make
1862   // sure we can capture all the incoming oop args from the
1863   // caller.
1864   //
1865   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1866 
1867   // Mark location of rbp (someday)
1868   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1869 
1870   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1871   // All inbound args are referenced based on rbp and all outbound args via rsp.
1872 
1873 
1874 #ifdef ASSERT
1875   bool reg_destroyed[RegisterImpl::number_of_registers];
1876   bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1877   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1878     reg_destroyed[r] = false;
1879   }
1880   for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1881     freg_destroyed[f] = false;
1882   }
1883 
1884 #endif /* ASSERT */
1885 
1886   // This may iterate in two different directions depending on the
1887   // kind of native it is.  The reason is that for regular JNI natives
1888   // the incoming and outgoing registers are offset upwards and for
1889   // critical natives they are offset down.
1890   GrowableArray<int> arg_order(2 * total_in_args);
1891 
1892   VMRegPair tmp_vmreg;
1893   tmp_vmreg.set2(rbx->as_VMReg());
1894 
1895   if (!is_critical_native) {
1896     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1897       arg_order.push(i);
1898       arg_order.push(c_arg);
1899     }
1900   } else {
1901     // Compute a valid move order, using tmp_vmreg to break any cycles
1902     ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
1903   }
1904 
1905   int temploc = -1;
1906   for (int ai = 0; ai < arg_order.length(); ai += 2) {
1907     int i = arg_order.at(ai);
1908     int c_arg = arg_order.at(ai + 1);
1909     __ block_comment(err_msg("move %d -> %d", i, c_arg));
1910     if (c_arg == -1) {
1911       assert(is_critical_native, "should only be required for critical natives");
1912       // This arg needs to be moved to a temporary
1913       __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
1914       in_regs[i] = tmp_vmreg;
1915       temploc = i;
1916       continue;
1917     } else if (i == -1) {
1918       assert(is_critical_native, "should only be required for critical natives");
1919       // Read from the temporary location
1920       assert(temploc != -1, "must be valid");
1921       i = temploc;
1922       temploc = -1;
1923     }
1924 #ifdef ASSERT
1925     if (in_regs[i].first()->is_Register()) {
1926       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1927     } else if (in_regs[i].first()->is_XMMRegister()) {
1928       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1929     }
1930     if (out_regs[c_arg].first()->is_Register()) {
1931       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1932     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1933       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1934     }
1935 #endif /* ASSERT */
1936     switch (in_sig_bt[i]) {
1937       case T_ARRAY:
1938         if (is_critical_native) {
1939           unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
1940           c_arg++;
1941 #ifdef ASSERT
1942           if (out_regs[c_arg].first()->is_Register()) {
1943             reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1944           } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1945             freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1946           }
1947 #endif
1948           break;
1949         }
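             // Regular JNI natives fall through here and pass the array as an
             // ordinary oop handle via the T_OBJECT case below.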
1950       case T_OBJECT:
1951         assert(!is_critical_native, "no oop arguments");
1952         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1953                     ((i == 0) && (!is_static)),
1954                     &receiver_offset);
1955         break;
1956       case T_VOID:
1957         break;
1958 
1959       case T_FLOAT:
1960         __ float_move(in_regs[i], out_regs[c_arg]);
1961         break;
1962 
1963       case T_DOUBLE:
1964         assert( i + 1 < total_in_args &&
1965                 in_sig_bt[i + 1] == T_VOID &&
1966                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1967         __ double_move(in_regs[i], out_regs[c_arg]);
1968         break;
1969 
1970       case T_LONG :
1971         __ long_move(in_regs[i], out_regs[c_arg]);
1972         break;
1973 
1974       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1975 
1976       default:
1977         __ move32_64(in_regs[i], out_regs[c_arg]);
1978     }
1979   }
1980 
1981   int c_arg;
1982 
1983   // Pre-load a static method's oop into r14.  Used both by locking code and
1984   // the normal JNI call code.
1985   if (!is_critical_native) {
1986     // point c_arg at the first arg that is already loaded in case we
1987     // need to spill before we call out
1988     c_arg = total_c_args - total_in_args;
1989 
1990     if (method->is_static()) {
1991 
1992       //  load oop into a register
1993       __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1994 
1995       // Now handlize the static class mirror; it's known to be non-null.
1996       __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1997       map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1998 
1999       // Now get the handle
2000       __ lea(oop_handle_reg, Address(rsp, klass_offset));
2001       // store the klass handle as second argument
2002       __ movptr(c_rarg1, oop_handle_reg);
2003       // and protect the arg if we must spill
2004       c_arg--;
2005     }
2006   } else {
2007     // For JNI critical methods we need to save all registers in save_args.
2008     c_arg = 0;
2009   }
2010 
2011   // Change state to native (we save the return address in the thread, since it might not
2012   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2013   // points into the right code segment. It does not have to be the correct return pc.
2014   // We use the same pc/oopMap repeatedly when we call out
2015 
2016   intptr_t the_pc = (intptr_t) __ pc();
2017   oop_maps->add_gc_map(the_pc - start, map);
2018 
2019   __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2020 
2021 
2022   // We have all of the arguments set up at this point. We must not touch any of the
2023   // register argument registers now (if we save/restore them, there is no oopMap covering them).
2024 
2025   {
2026     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2027     // protect the args we've loaded
2028     save_args(masm, total_c_args, c_arg, out_regs);
2029     __ mov_metadata(c_rarg1, method());
2030     __ call_VM_leaf(
2031       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2032       r15_thread, c_rarg1);
2033     restore_args(masm, total_c_args, c_arg, out_regs);
2034   }
2035 
2036   // RedefineClasses() tracing support for obsolete method entry
2037   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2038     // protect the args we've loaded
2039     save_args(masm, total_c_args, c_arg, out_regs);
2040     __ mov_metadata(c_rarg1, method());
2041     __ call_VM_leaf(
2042       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2043       r15_thread, c_rarg1);
2044     restore_args(masm, total_c_args, c_arg, out_regs);
2045   }
2046 
2047   // Lock a synchronized method
2048 
2049   // Register definitions used by locking and unlocking
2050 
2051   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2052   const Register obj_reg  = rbx;  // Will contain the oop
2053   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2054   const Register old_hdr  = r13;  // value of old header at unlock time
2055 
2056   Label slow_path_lock;
2057   Label lock_done;
2058 
2059   if (method->is_synchronized()) {
2060     assert(!is_critical_native, "unhandled");
2061 
2062 
2063     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2064 
2065     // Get the handle (the 2nd argument)
2066     __ mov(oop_handle_reg, c_rarg1);
2067 
2068     // Get address of the box
2069 
2070     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2071 
2072     // Load the oop from the handle
2073     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2074 
2075     if (LockingMode == LM_MONITOR) {
2076       __ jmp(slow_path_lock);
2077     } else if (LockingMode == LM_LEGACY) {
2078       if (UseBiasedLocking) {
2079         __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, rscratch2, false, lock_done, &slow_path_lock);
2080       }
2081 
2082       // Load immediate 1 into swap_reg %rax
2083       __ movl(swap_reg, 1);
2084 
2085       // Load (object->mark() | 1) into swap_reg %rax
2086       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2087 
2088       // Save (object->mark() | 1) into BasicLock's displaced header
2089       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2090 
2091       // src -> dest iff dest == rax else rax <- dest
2092       __ lock();
2093       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2094       __ jcc(Assembler::equal, lock_done);
2095 
2096       // Hmm should this move to the slow path code area???
2097 
2098       // Test if the oopMark is an obvious stack pointer, i.e.,
2099       //  1) (mark & 3) == 0, and
2100       //  2) rsp <= mark < rsp + os::pagesize()
2101       // These 3 tests can be done by evaluating the following
2102       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2103       // assuming both stack pointer and pagesize have their
2104       // least significant 2 bits clear.
2105       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
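           // Worked example, assuming a 4K page: 3 - os::vm_page_size() is
           // -4093 == 0x...fffff003, so the andptr below clears bits 2..11 of
           // (mark - rsp).  If the mark points into the page just above rsp
           // (i.e. rsp <= mark < rsp + 4096) with its low two bits clear, the
           // masked result is zero (the recursive case); otherwise a non-zero
           // value remains and we take the slow path.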
2106 
2107       __ subptr(swap_reg, rsp);
2108       __ andptr(swap_reg, 3 - os::vm_page_size());
2109 
2110       // Save the test result, for recursive case, the result is zero
2111       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2112       __ jcc(Assembler::notEqual, slow_path_lock);
2113     } else {
2114       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2115       // Load object header
2116       __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2117       __ fast_lock_impl(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2118     }
2119 
2120     // Slow path will re-enter here
2121 
2122     __ bind(lock_done);
2123   }
2124 
2125   // Finally just about ready to make the JNI call
2126 
2127   // get JNIEnv* which is first argument to native
2128   if (!is_critical_native) {
2129     __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2130 
2131     // Now set thread in native
2132     __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2133   }
2134 
2135   __ call(RuntimeAddress(native_func));
2136 
2137   // Verify or restore cpu control state after JNI call
2138   __ restore_cpu_control_state_after_jni();
2139 
2140   // Unpack native results.
2141   switch (ret_type) {
2142   case T_BOOLEAN: __ c2bool(rax);            break;
2143   case T_CHAR   : __ movzwl(rax, rax);      break;
2144   case T_BYTE   : __ sign_extend_byte (rax); break;
2145   case T_SHORT  : __ sign_extend_short(rax); break;
2146   case T_INT    : /* nothing to do */        break;
2147   case T_DOUBLE :
2148   case T_FLOAT  :
2149     // Result is in xmm0 we'll save as needed
2150     break;
2151   case T_ARRAY:                 // Really a handle
2152   case T_OBJECT:                // Really a handle
2153       break; // can't de-handlize until after safepoint check
2154   case T_VOID: break;
2155   case T_LONG: break;
2156   default       : ShouldNotReachHere();
2157   }
2158 
2159   Label after_transition;
2160 
2161   // If this is a critical native, check for a safepoint or suspend request after the call.
2162   // If a safepoint is needed, transition to native, then to native_trans to handle
2163   // safepoints like the native methods that are not critical natives.
2164   if (is_critical_native) {
2165     Label needs_safepoint;
2166     __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */);
2167     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2168     __ jcc(Assembler::equal, after_transition);
2169     __ bind(needs_safepoint);
2170   }
2171 
2172   // Switch thread to "native transition" state before reading the synchronization state.
2173   // This additional state is necessary because reading and testing the synchronization
2174   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2175   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2176   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2177   //     Thread A is resumed to finish this native method, but doesn't block here since it
2178   //     didn't see any synchronization in progress, and escapes.
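       // For example, the thread-state sequence around the native call above is
       //   _thread_in_native (set before the call, unless critical native)
       //     -> _thread_in_native_trans (here)
       //     -> safepoint/suspend poll, possibly blocking in
       //        check_special_condition_for_native_trans()
       //     -> _thread_in_Java (at after_transition below)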
2179   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2180 
2181   // Force this write out before the read below
2182   __ membar(Assembler::Membar_mask_bits(
2183               Assembler::LoadLoad | Assembler::LoadStore |
2184               Assembler::StoreLoad | Assembler::StoreStore));
2185 
2186   // check for safepoint operation in progress and/or pending suspend requests
2187   {
2188     Label Continue;
2189     Label slow_path;
2190 
2191     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2192 
2193     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2194     __ jcc(Assembler::equal, Continue);
2195     __ bind(slow_path);
2196 
2197     // Don't use call_VM as it will see a possible pending exception and forward it
2198     // and never return here preventing us from clearing _last_native_pc down below.
2199     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2200     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2201     // by hand.
2202     //
2203     __ vzeroupper();
2204     save_native_result(masm, ret_type, stack_slots);
2205     __ mov(c_rarg0, r15_thread);
2206     __ mov(r12, rsp); // remember sp
2207     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2208     __ andptr(rsp, -16); // align stack as required by ABI
2209     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2210     __ mov(rsp, r12); // restore sp
2211     __ reinit_heapbase();
2212     // Restore any method result value
2213     restore_native_result(masm, ret_type, stack_slots);
2214     __ bind(Continue);
2215   }
2216 
2217   // change thread state
2218   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2219   __ bind(after_transition);
2220 
2221   Label reguard;
2222   Label reguard_done;
2223   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2224   __ jcc(Assembler::equal, reguard);
2225   __ bind(reguard_done);
2226 
2227   // native result if any is live
2228 
2229   // Unlock
2230   Label unlock_done;
2231   Label slow_path_unlock;
2232   if (method->is_synchronized()) {
2233 
2234     // Get locked oop from the handle we passed to jni
2235     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2236 
2237     Label done;
2238 
2239     if (UseBiasedLocking) {
2240       __ biased_locking_exit(obj_reg, old_hdr, done);
2241     }
2242 
2243     if (LockingMode == LM_LEGACY) {
2244       // Simple recursive lock?
2245 
2246       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2247       __ jcc(Assembler::equal, done);
2248     }
2249 
2250     // Must save rax if it is live now because cmpxchg must use it
2251     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2252       save_native_result(masm, ret_type, stack_slots);
2253     }
2254 
2255     if (LockingMode == LM_MONITOR) {
2256       __ jmp(slow_path_unlock);
2257     } else if (LockingMode == LM_LEGACY) {
2258       // get address of the stack lock
2259       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2260       //  get old displaced header
2261       __ movptr(old_hdr, Address(rax, 0));
2262 
2263       // Atomic swap old header if oop still contains the stack lock
2264       __ lock();
2265       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2266       __ jcc(Assembler::notEqual, slow_path_unlock);
2267     } else {
2268       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2269       __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2270       __ andptr(swap_reg, ~(int32_t)markWord::lock_mask_in_place);
2271       __ fast_unlock_impl(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2272     }
2273 
2274     // slow path re-enters here
2275     __ bind(unlock_done);
2276     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2277       restore_native_result(masm, ret_type, stack_slots);
2278     }
2279 
2280     __ bind(done);
2281 
2282   }
2283   {
2284     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2285     save_native_result(masm, ret_type, stack_slots);
2286     __ mov_metadata(c_rarg1, method());
2287     __ call_VM_leaf(
2288          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2289          r15_thread, c_rarg1);
2290     restore_native_result(masm, ret_type, stack_slots);
2291   }
2292 
2293   __ reset_last_Java_frame(false);
2294 
2295   // Unbox oop result, e.g. JNIHandles::resolve value.
2296   if (is_reference_type(ret_type)) {
2297     __ resolve_jobject(rax /* value */,
2298                        r15_thread /* thread */,
2299                        rcx /* tmp */);
2300   }
2301 
2302   if (CheckJNICalls) {
2303     // clear_pending_jni_exception_check
2304     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2305   }
2306 
2307   if (!is_critical_native) {
2308     // reset handle block
2309     __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2310     __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2311   }
2312 
2313   // pop our frame
2314 
2315   __ leave();
2316 
2317   if (!is_critical_native) {
2318     // Any exception pending?
2319     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2320     __ jcc(Assembler::notEqual, exception_pending);
2321   }
2322 
2323   // Return
2324 
2325   __ ret(0);
2326 
2327   // Unexpected paths are out of line and go here
2328 
2329   if (!is_critical_native) {
2330     // forward the exception
2331     __ bind(exception_pending);
2332 
2333     // and forward the exception
2334     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2335   }
2336 
2337   // Slow path locking & unlocking
2338   if (method->is_synchronized()) {
2339 
2340     // BEGIN Slow path lock
2341     __ bind(slow_path_lock);
2342 
2343     // has last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM
2344     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2345 
2346     // protect the args we've loaded
2347     save_args(masm, total_c_args, c_arg, out_regs);
2348 
2349     __ mov(c_rarg0, obj_reg);
2350     __ mov(c_rarg1, lock_reg);
2351     __ mov(c_rarg2, r15_thread);
2352 
2353     // Not a leaf but we have last_Java_frame setup as we want
2354     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2355     restore_args(masm, total_c_args, c_arg, out_regs);
2356 
2357 #ifdef ASSERT
2358     { Label L;
2359     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2360     __ jcc(Assembler::equal, L);
2361     __ stop("no pending exception allowed on exit from monitorenter");
2362     __ bind(L);
2363     }
2364 #endif
2365     __ jmp(lock_done);
2366 
2367     // END Slow path lock
2368 
2369     // BEGIN Slow path unlock
2370     __ bind(slow_path_unlock);
2371 
2372     // If we haven't already saved the native result we must save it now as xmm registers
2373     // are still exposed.
2374     __ vzeroupper();
2375     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2376       save_native_result(masm, ret_type, stack_slots);
2377     }
2378 
2379     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2380 
2381     __ mov(c_rarg0, obj_reg);
2382     __ mov(c_rarg2, r15_thread);
2383     __ mov(r12, rsp); // remember sp
2384     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2385     __ andptr(rsp, -16); // align stack as required by ABI
2386 
2387     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2388     // NOTE that obj_reg == rbx currently
2389     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2390     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2391 
2392     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2393     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2394     __ mov(rsp, r12); // restore sp
2395     __ reinit_heapbase();
2396 #ifdef ASSERT
2397     {
2398       Label L;
2399       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2400       __ jcc(Assembler::equal, L);
2401       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2402       __ bind(L);
2403     }
2404 #endif /* ASSERT */
2405 
2406     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2407 
2408     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2409       restore_native_result(masm, ret_type, stack_slots);
2410     }
2411     __ jmp(unlock_done);
2412 
2413     // END Slow path unlock
2414 
2415   } // synchronized
2416 
2417   // SLOW PATH Reguard the stack if needed
2418 
2419   __ bind(reguard);
2420   __ vzeroupper();
2421   save_native_result(masm, ret_type, stack_slots);
2422   __ mov(r12, rsp); // remember sp
2423   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2424   __ andptr(rsp, -16); // align stack as required by ABI
2425   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2426   __ mov(rsp, r12); // restore sp
2427   __ reinit_heapbase();
2428   restore_native_result(masm, ret_type, stack_slots);
2429   // and continue
2430   __ jmp(reguard_done);
2431 
2432 
2433 
2434   __ flush();
2435 
2436   nmethod *nm = nmethod::new_native_nmethod(method,
2437                                             compile_id,
2438                                             masm->code(),
2439                                             vep_offset,
2440                                             frame_complete,
2441                                             stack_slots / VMRegImpl::slots_per_word,
2442                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2443                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2444                                             oop_maps);
2445 
2446   return nm;
2447 }
2448 
2449 // This function returns the adjustment (in number of words) to apply to a c2i adapter
2450 // activation for use during deoptimization
2451 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2452   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2453 }
2454 
2455 
2456 uint SharedRuntime::out_preserve_stack_slots() {
2457   return 0;
2458 }
2459 
2460 
2461 // Number of stack slots between incoming argument block and the start of
2462 // a new frame.  The PROLOG must add this many slots to the stack.  The
2463 // EPILOG must remove this many slots.  amd64 needs two slots for
2464 // return address.
2465 uint SharedRuntime::in_preserve_stack_slots() {
2466   return 4 + 2 * VerifyStackAtCalls;
2467 }
2468 
2469 //------------------------------generate_deopt_blob----------------------------
2470 void SharedRuntime::generate_deopt_blob() {
2471   // Allocate space for the code
2472   ResourceMark rm;
2473   // Setup code generation tools
2474   int pad = 0;
2475   if (UseAVX > 2) {
2476     pad += 1024;
2477   }
2478 #if INCLUDE_JVMCI
2479   if (EnableJVMCI) {
2480     pad += 512; // Increase the buffer size when compiling for JVMCI
2481   }
2482 #endif
2483   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2484   MacroAssembler* masm = new MacroAssembler(&buffer);
2485   int frame_size_in_words;
2486   OopMap* map = NULL;
2487   OopMapSet *oop_maps = new OopMapSet();
2488 
2489   // -------------
2490   // This code enters when returning to a de-optimized nmethod.  A return
2491 // address has been pushed on the stack, and return values are in
2492   // registers.
2493   // If we are doing a normal deopt then we were called from the patched
2494   // nmethod from the point we returned to the nmethod. So the return
2495   // address on the stack is wrong by NativeCall::instruction_size
2496   // We will adjust the value so it looks like we have the original return
2497   // address on the stack (like when we eagerly deoptimized).
2498   // In the case of an exception pending when deoptimizing, we enter
2499   // with a return address on the stack that points after the call we patched
2500   // into the exception handler. We have the following register state from,
2501   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2502   //    rax: exception oop
2503   //    rbx: exception handler
2504   //    rdx: throwing pc
2505   // So in this case we simply jam rdx into the useless return address and
2506   // the stack looks just like we want.
2507   //
2508   // At this point we need to de-opt.  We save the argument return
2509   // registers.  We call the first C routine, fetch_unroll_info().  This
2510   // routine captures the return values and returns a structure which
2511   // describes the current frame size and the sizes of all replacement frames.
2512   // The current frame is compiled code and may contain many inlined
2513   // functions, each with their own JVM state.  We pop the current frame, then
2514   // push all the new frames.  Then we call the C routine unpack_frames() to
2515   // populate these frames.  Finally unpack_frames() returns us the new target
2516   // address.  Notice that callee-save registers are BLOWN here; they have
2517   // already been captured in the vframeArray at the time the return PC was
2518   // patched.
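       // As a hypothetical illustration: if the deoptimized compiled frame held a
       // method with two inlined callees, fetch_unroll_info() reports the sizes of
       // three replacement frames; the code below pops the single compiled frame,
       // pushes three skeletal interpreter frames in its place, and unpack_frames()
       // then fills them in and returns the address at which execution resumes.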
2519   address start = __ pc();
2520   Label cont;
2521 
2522   // Prolog for non exception case!
2523 
2524   // Save everything in sight.
2525   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2526 
2527   // Normal deoptimization.  Save exec mode for unpack_frames.
2528   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2529   __ jmp(cont);
2530 
2531   int reexecute_offset = __ pc() - start;
2532 #if INCLUDE_JVMCI && !defined(COMPILER1)
2533   if (EnableJVMCI && UseJVMCICompiler) {
2534     // JVMCI does not use this kind of deoptimization
2535     __ should_not_reach_here();
2536   }
2537 #endif
2538 
2539   // Reexecute case
2540   // the return address is the pc that describes what bci to re-execute at
2541 
2542   // No need to update map as each call to save_live_registers will produce identical oopmap
2543   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2544 
2545   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2546   __ jmp(cont);
2547 
2548 #if INCLUDE_JVMCI
2549   Label after_fetch_unroll_info_call;
2550   int implicit_exception_uncommon_trap_offset = 0;
2551   int uncommon_trap_offset = 0;
2552 
2553   if (EnableJVMCI) {
2554     implicit_exception_uncommon_trap_offset = __ pc() - start;
2555 
2556     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2557     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2558 
2559     uncommon_trap_offset = __ pc() - start;
2560 
2561     // Save everything in sight.
2562     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2563     // fetch_unroll_info needs to call last_java_frame()
2564     __ set_last_Java_frame(noreg, noreg, NULL);
2565 
2566     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2567     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2568 
2569     __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2570     __ mov(c_rarg0, r15_thread);
2571     __ movl(c_rarg2, r14); // exec mode
2572     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2573     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2574 
2575     __ reset_last_Java_frame(false);
2576 
2577     __ jmp(after_fetch_unroll_info_call);
2578   } // EnableJVMCI
2579 #endif // INCLUDE_JVMCI
2580 
2581   int exception_offset = __ pc() - start;
2582 
2583   // Prolog for exception case
2584 
2585   // all registers are dead at this entry point, except for rax and
2586   // rdx, which contain the exception oop and exception pc
2587   // respectively.  Set them in TLS and fall thru to the
2588   // unpack_with_exception_in_tls entry point.
2589 
2590   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2591   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2592 
2593   int exception_in_tls_offset = __ pc() - start;
2594 
2595   // new implementation because exception oop is now passed in JavaThread
2596 
2597   // Prolog for exception case
2598   // All registers must be preserved because they might be used by LinearScan
2599   // Exception oop and throwing PC are passed in JavaThread
2600   // tos: stack at point of call to method that threw the exception (i.e. only
2601   // args are on the stack, no return address)
2602 
2603   // make room on stack for the return address
2604   // It will be patched later with the throwing pc. The correct value is not
2605   // available now because loading it from memory would destroy registers.
2606   __ push(0);
2607 
2608   // Save everything in sight.
2609   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2610 
2611   // Now it is safe to overwrite any register
2612 
2613   // Deopt during an exception.  Save exec mode for unpack_frames.
2614   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2615 
2616   // load throwing pc from JavaThread and patch it as the return address
2617   // of the current frame. Then clear the field in JavaThread
2618 
2619   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2620   __ movptr(Address(rbp, wordSize), rdx);
2621   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2622 
2623 #ifdef ASSERT
2624   // verify that there is really an exception oop in JavaThread
2625   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2626   __ verify_oop(rax);
2627 
2628   // verify that there is no pending exception
2629   Label no_pending_exception;
2630   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2631   __ testptr(rax, rax);
2632   __ jcc(Assembler::zero, no_pending_exception);
2633   __ stop("must not have pending exception here");
2634   __ bind(no_pending_exception);
2635 #endif
2636 
2637   __ bind(cont);
2638 
2639   // Call C code.  Need thread and this frame, but NOT official VM entry
2640   // crud.  We cannot block on this call, no GC can happen.
2641   //
2642   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2643 
2644   // fetch_unroll_info needs to call last_java_frame().
2645 
2646   __ set_last_Java_frame(noreg, noreg, NULL);
2647 #ifdef ASSERT
2648   { Label L;
2649     __ cmpptr(Address(r15_thread,
2650                       JavaThread::last_Java_fp_offset()),
2651               (int32_t)0);
2652     __ jcc(Assembler::equal, L);
2653     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2654     __ bind(L);
2655   }
2656 #endif // ASSERT
2657   __ mov(c_rarg0, r15_thread);
2658   __ movl(c_rarg1, r14); // exec_mode
2659   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2660 
2661   // Need to have an oopmap that tells fetch_unroll_info where to
2662   // find any register it might need.
2663   oop_maps->add_gc_map(__ pc() - start, map);
2664 
2665   __ reset_last_Java_frame(false);
2666 
2667 #if INCLUDE_JVMCI
2668   if (EnableJVMCI) {
2669     __ bind(after_fetch_unroll_info_call);
2670   }
2671 #endif
2672 
2673   // Load UnrollBlock* into rdi
2674   __ mov(rdi, rax);
2675 
2676   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2677   Label noException;
2678   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2679   __ jcc(Assembler::notEqual, noException);
2680   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2681   // QQQ this load is useless: the exception pc was already cleared to NULL above
2682   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2683   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2684   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2685 
2686   __ verify_oop(rax);
2687 
2688   // Overwrite the result registers with the exception results.
2689   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2690   // I think this is useless too (rdx holds the exception pc cleared above)
2691   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2692 
2693   __ bind(noException);
2694 
2695   // Only register save data is on the stack.
2696   // Now restore the result registers.  Everything else is either dead
2697   // or captured in the vframeArray.
2698   RegisterSaver::restore_result_registers(masm);
2699 
2700   // All of the register save area has been popped off the stack. Only the
2701   // return address remains.
2702 
2703   // Pop all the frames we must move/replace.
2704   //
2705   // Frame picture (youngest to oldest)
2706   // 1: self-frame (no frame link)
2707   // 2: deopting frame  (no frame link)
2708   // 3: caller of deopting frame (could be compiled/interpreted).
2709   //
2710   // Note: by leaving the return address of self-frame on the stack
2711   // and using the size of frame 2 to adjust the stack
2712   // when we are done the return to frame 3 will still be on the stack.
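       // A sketch of the accounting (restating the note above): the recorded
       // size of the deoptimized frame evidently includes its return-address
       // slot, and our own return pc was deliberately left on the stack, so
       // the addptr below lands rsp exactly on the return address into frame 3.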
2713 
2714   // Pop deoptimized frame
2715   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2716   __ addptr(rsp, rcx);
2717 
2718   // rsp should be pointing at the return address to the caller (3)
2719 
2720   // Pick up the initial fp we should save
2721   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2722   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2723 
2724 #ifdef ASSERT
2725   // Compilers generate code that bangs the stack by as much as the
2726   // interpreter would need. So this stack banging should never
2727   // trigger a fault. Verify that it does not on non-product builds.
2728   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2729   __ bang_stack_size(rbx, rcx);
2730 #endif
2731 
2732   // Load address of array of frame pcs into rcx
2733   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2734 
2735   // Trash the old pc
2736   __ addptr(rsp, wordSize);
2737 
2738   // Load address of array of frame sizes into rsi
2739   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2740 
2741   // Load counter into rdx
2742   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2743 
2744   // Now adjust the caller's stack to make up for the extra locals
2745   // but record the original sp so that we can save it in the skeletal interpreter
2746   // frame and the stack walking of interpreter_sender will get the unextended sp
2747   // value and not the "real" sp value.
2748 
2749   const Register sender_sp = r8;
2750 
2751   __ mov(sender_sp, rsp);
2752   __ movl(rbx, Address(rdi,
2753                        Deoptimization::UnrollBlock::
2754                        caller_adjustment_offset_in_bytes()));
2755   __ subptr(rsp, rbx);
2756 
2757   // Push interpreter frames in a loop
2758   Label loop;
2759   __ bind(loop);
2760   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2761   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2762   __ pushptr(Address(rcx, 0));          // Save return address
2763   __ enter();                           // Save old & set new ebp
2764   __ subptr(rsp, rbx);                  // Prolog
2765   // This value is corrected by layout_activation_impl
2766   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2767   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2768   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2769   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2770   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2771   __ decrementl(rdx);                   // Decrement counter
2772   __ jcc(Assembler::notZero, loop);
2773   __ pushptr(Address(rcx, 0));          // Save final return address
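       // At this point the stack holds one skeletal interpreter frame per entry
       // in the unroll arrays (youngest on top), each made of a return pc, a
       // saved rbp and an uninitialized body of the recorded size, plus the
       // final return address just pushed; unpack_frames() fills them in later.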
2774 
2775   // Re-push self-frame
2776   __ enter();                           // Save old & set new ebp
2777 
2778   // Allocate a full sized register save area.
2779   // Return address and rbp are in place, so we allocate two fewer words.
2780   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2781 
2782   // Restore frame locals after moving the frame
2783   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2784   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2785 
2786   // Call C code.  Need thread but NOT official VM entry
2787   // crud.  We cannot block on this call, no GC can happen.  Call should
2788   // restore return values to their stack-slots with the new SP.
2789   //
2790   // BasicType Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2791 
2792   // Use rbp because the frames look interpreted now
2793   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2794   // Don't need the precise return PC here, just precise enough to point into this code blob.
2795   address the_pc = __ pc();
2796   __ set_last_Java_frame(noreg, rbp, the_pc);
2797 
2798   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2799   __ mov(c_rarg0, r15_thread);
2800   __ movl(c_rarg1, r14); // second arg: exec_mode
2801   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2802   // Revert SP alignment after call since we're going to do some SP relative addressing below
2803   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2804 
2805   // Set an oopmap for the call site
2806   // Use the same PC we used for the last java frame
2807   oop_maps->add_gc_map(the_pc - start,
2808                        new OopMap( frame_size_in_words, 0 ));
2809 
2810   // Clear fp AND pc
2811   __ reset_last_Java_frame(true);
2812 
2813   // Collect return values
2814   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2815   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2816   // I think this is useless (throwing pc?)
2817   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2818 
2819   // Pop self-frame.
2820   __ leave();                           // Epilog
2821 
2822   // Jump to interpreter
2823   __ ret(0);
2824 
2825   // Make sure all code is generated
2826   masm->flush();
2827 
2828   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2829   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2830 #if INCLUDE_JVMCI
2831   if (EnableJVMCI) {
2832     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2833     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2834   }
2835 #endif
2836 }
2837 
2838 #ifdef COMPILER2
2839 //------------------------------generate_uncommon_trap_blob--------------------
2840 void SharedRuntime::generate_uncommon_trap_blob() {
2841   // Allocate space for the code
2842   ResourceMark rm;
2843   // Setup code generation tools
2844   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2845   MacroAssembler* masm = new MacroAssembler(&buffer);
2846 
2847   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2848 
2849   address start = __ pc();
2850 
2851   if (UseRTMLocking) {
2852     // Abort RTM transaction before possible nmethod deoptimization.
2853     __ xabort(0);
2854   }
2855 
2856   // Push self-frame.  We get here with a return address on the
2857   // stack, so rsp is 8-byte aligned until we allocate our frame.
2858   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
2859 
2860   // No callee saved registers. rbp is assumed implicitly saved
2861   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2862 
2863   // The compiler left unloaded_class_index in j_rarg0; move it to where
2864   // the runtime expects it.
2865   __ movl(c_rarg1, j_rarg0);
2866 
2867   __ set_last_Java_frame(noreg, noreg, NULL);
2868 
2869   // Call C code.  Need thread but NOT official VM entry
2870   // crud.  We cannot block on this call, no GC can happen.  Call should
2871   // capture callee-saved registers as well as return values.
2872   // The thread is passed explicitly in c_rarg0 below.
2873   //
2874   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
2875 
2876   __ mov(c_rarg0, r15_thread);
2877   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2878   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2879 
2880   // Set an oopmap for the call site
2881   OopMapSet* oop_maps = new OopMapSet();
2882   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2883 
2884   // location of rbp is known implicitly by the frame sender code
2885 
2886   oop_maps->add_gc_map(__ pc() - start, map);
2887 
2888   __ reset_last_Java_frame(false);
2889 
2890   // Load UnrollBlock* into rdi
2891   __ mov(rdi, rax);
2892 
2893 #ifdef ASSERT
2894   { Label L;
2895     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2896             (int32_t)Deoptimization::Unpack_uncommon_trap);
2897     __ jcc(Assembler::equal, L);
2898     __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap");
2899     __ bind(L);
2900   }
2901 #endif
2902 
2903   // Pop all the frames we must move/replace.
2904   //
2905   // Frame picture (youngest to oldest)
2906   // 1: self-frame (no frame link)
2907   // 2: deopting frame  (no frame link)
2908   // 3: caller of deopting frame (could be compiled/interpreted).
2909 
2910   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2911   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2912 
2913   // Pop deoptimized frame (int)
2914   __ movl(rcx, Address(rdi,
2915                        Deoptimization::UnrollBlock::
2916                        size_of_deoptimized_frame_offset_in_bytes()));
2917   __ addptr(rsp, rcx);
2918 
2919   // rsp should be pointing at the return address to the caller (3)
2920 
2921   // Pick up the initial fp we should save
2922   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2923   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2924 
2925 #ifdef ASSERT
2926   // Compilers generate code that bangs the stack by as much as the
2927   // interpreter would need. So this stack banging should never
2928   // trigger a fault. Verify that it does not on non-product builds.
2929   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2930   __ bang_stack_size(rbx, rcx);
2931 #endif
2932 
2933   // Load address of array of frame pcs into rcx (address*)
2934   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2935 
2936   // Trash the return pc
2937   __ addptr(rsp, wordSize);
2938 
2939   // Load address of array of frame sizes into rsi (intptr_t*)
2940   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2941 
2942   // Counter
2943   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
2944 
2945   // Now adjust the caller's stack to make up for the extra locals but
2946   // record the original sp so that we can save it in the skeletal
2947   // interpreter frame and the stack walking of interpreter_sender
2948   // will get the unextended sp value and not the "real" sp value.
2949 
2950   const Register sender_sp = r8;
2951 
2952   __ mov(sender_sp, rsp);
2953   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
2954   __ subptr(rsp, rbx);
2955 
2956   // Push interpreter frames in a loop
2957   Label loop;
2958   __ bind(loop);
2959   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2960   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2961   __ pushptr(Address(rcx, 0));     // Save return address
2962   __ enter();                      // Save old & set new rbp
2963   __ subptr(rsp, rbx);             // Prolog
2964   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2965             sender_sp);            // Make it walkable
2966   // This value is corrected by layout_activation_impl
2967   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2968   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
2969   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
2970   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
2971   __ decrementl(rdx);              // Decrement counter
2972   __ jcc(Assembler::notZero, loop);
2973   __ pushptr(Address(rcx, 0));     // Save final return address
2974 
2975   // Re-push self-frame
2976   __ enter();                 // Save old & set new rbp
2977   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2978                               // Prolog
2979 
2980   // Use rbp because the frames look interpreted now
2981   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2982   // Don't need the precise return PC here, just precise enough to point into this code blob.
2983   address the_pc = __ pc();
2984   __ set_last_Java_frame(noreg, rbp, the_pc);
2985 
2986   // Call C code.  Need thread but NOT official VM entry
2987   // crud.  We cannot block on this call, no GC can happen.  Call should
2988   // restore return values to their stack-slots with the new SP.
2989   // The thread is passed explicitly in c_rarg0 below.
2990   //
2991   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2992 
2993   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2994   __ mov(c_rarg0, r15_thread);
2995   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2996   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2997 
2998   // Set an oopmap for the call site
2999   // Use the same PC we used for the last java frame
3000   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3001 
3002   // Clear fp AND pc
3003   __ reset_last_Java_frame(true);
3004 
3005   // Pop self-frame.
3006   __ leave();                 // Epilog
3007 
3008   // Jump to interpreter
3009   __ ret(0);
3010 
3011   // Make sure all code is generated
3012   masm->flush();
3013 
3014   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3015                                                  SimpleRuntimeFrame::framesize >> 1);
3016 }
3017 #endif // COMPILER2
3018 
3019 //------------------------------generate_handler_blob------
3020 //
3021 // Generate a special Compile2Runtime blob that saves all registers
3022 // and sets up an oopmap.
3023 //
3024 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3025   assert(StubRoutines::forward_exception_entry() != NULL,
3026          "must be generated before");
3027 
3028   ResourceMark rm;
3029   OopMapSet *oop_maps = new OopMapSet();
3030   OopMap* map;
3031 
3032   // Allocate space for the code.  Setup code generation tools.
3033   CodeBuffer buffer("handler_blob", 2048, 1024);
3034   MacroAssembler* masm = new MacroAssembler(&buffer);
3035 
3036   address start   = __ pc();
3037   address call_pc = NULL;
3038   int frame_size_in_words;
3039   bool cause_return = (poll_type == POLL_AT_RETURN);
3040   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3041 
3042   if (UseRTMLocking) {
3043     // Abort RTM transaction before calling runtime
3044     // because the critical section will be large and will be
3045     // aborted anyway. Also, the nmethod could be deoptimized.
3046     __ xabort(0);
3047   }
3048 
3049   // Make room for return address (or push it again)
3050   if (!cause_return) {
3051     __ push(rbx);
3052   }
3053 
3054   // Save registers, fpu state, and flags
3055   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3056 
3057   // The following is basically a call_VM.  However, we need the precise
3058   // address of the call in order to generate an oopmap. Hence, we do all the
3059   // work ourselves.
3060 
3061   __ set_last_Java_frame(noreg, noreg, NULL);
3062 
3063   // The return address must always be correct so that frame constructor never
3064   // sees an invalid pc.
3065 
3066   if (!cause_return) {
3067     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3068     // Additionally, rbx is a callee saved register and we can look at it later to determine
3069     // if someone changed the return address for us!
3070     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3071     __ movptr(Address(rbp, wordSize), rbx);
3072   }
3073 
3074   // Do the call
3075   __ mov(c_rarg0, r15_thread);
3076   __ call(RuntimeAddress(call_ptr));
3077 
3078   // Set an oopmap for the call site.  This oopmap will map all
3079   // oop-registers and debug-info registers as callee-saved.  This
3080   // will allow deoptimization at this safepoint to find all possible
3081   // debug-info recordings, as well as let GC find all oops.
3082 
3083   oop_maps->add_gc_map( __ pc() - start, map);
3084 
3085   Label noException;
3086 
3087   __ reset_last_Java_frame(false);
3088 
3089   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3090   __ jcc(Assembler::equal, noException);
3091 
3092   // Exception pending
3093 
3094   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3095 
3096   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3097 
3098   // No exception case
3099   __ bind(noException);
3100 
3101   Label no_adjust;
3102 #ifdef ASSERT
3103   Label bail;
3104 #endif
3105   if (!cause_return) {
3106     Label no_prefix, not_special;
3107 
3108     // If our stashed return pc was modified by the runtime we avoid touching it
3109     __ cmpptr(rbx, Address(rbp, wordSize));
3110     __ jccb(Assembler::notEqual, no_adjust);
3111 
3112     // Skip over the poll instruction.
3113     // See NativeInstruction::is_safepoint_poll()
3114     // Possible encodings:
3115     //      85 00       test   %eax,(%rax)
3116     //      85 01       test   %eax,(%rcx)
3117     //      85 02       test   %eax,(%rdx)
3118     //      85 03       test   %eax,(%rbx)
3119     //      85 06       test   %eax,(%rsi)
3120     //      85 07       test   %eax,(%rdi)
3121     //
3122     //   41 85 00       test   %eax,(%r8)
3123     //   41 85 01       test   %eax,(%r9)
3124     //   41 85 02       test   %eax,(%r10)
3125     //   41 85 03       test   %eax,(%r11)
3126     //   41 85 06       test   %eax,(%r14)
3127     //   41 85 07       test   %eax,(%r15)
3128     //
3129     //      85 04 24    test   %eax,(%rsp)
3130     //   41 85 04 24    test   %eax,(%r12)
3131     //      85 45 00    test   %eax,0x0(%rbp)
3132     //   41 85 45 00    test   %eax,0x0(%r13)
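         // Sketch of the decoding done below (not a general decoder): skip an
         // optional REX.B prefix (0x41), add one extra byte when the modrm base
         // is rsp/rbp/r12/r13 (these encodings carry a SIB byte or a disp8),
         // then step over the two-byte opcode+modrm core of the test.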
3133 
3134     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3135     __ jcc(Assembler::notEqual, no_prefix);
3136     __ addptr(rbx, 1);
3137     __ bind(no_prefix);
3138 #ifdef ASSERT
3139     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3140 #endif
3141     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3142     // r12/rsp 0x04
3143     // r13/rbp 0x05
3144     __ movzbq(rcx, Address(rbx, 1));
3145     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3146     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3147     __ cmpptr(rcx, 1);
3148     __ jcc(Assembler::above, not_special);
3149     __ addptr(rbx, 1);
3150     __ bind(not_special);
3151 #ifdef ASSERT
3152     // Verify the correct encoding of the poll we're about to skip.
3153     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3154     __ jcc(Assembler::notEqual, bail);
3155     // Mask out the modrm bits
3156     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3157     // rax encodes to 0, so if the bits are nonzero it's incorrect
3158     __ jcc(Assembler::notZero, bail);
3159 #endif
3160     // Adjust return pc forward to step over the safepoint poll instruction
3161     __ addptr(rbx, 2);
3162     __ movptr(Address(rbp, wordSize), rbx);
3163   }
3164 
3165   __ bind(no_adjust);
3166   // Normal exit, restore registers and exit.
3167   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3168   __ ret(0);
3169 
3170 #ifdef ASSERT
3171   __ bind(bail);
3172   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3173 #endif
3174 
3175   // Make sure all code is generated
3176   masm->flush();
3177 
3178   // Fill-out other meta info
3179   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3180 }
3181 
3182 //
3183 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3184 //
3185 // Generate a stub that calls into vm to find out the proper destination
3186 // of a java call. All the argument registers are live at this point
3187 // but since this is generic code we don't know what they are and the caller
3188 // must do any gc of the args.
3189 //
3190 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3191   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3192 
3193   // allocate space for the code
3194   ResourceMark rm;
3195 
3196   CodeBuffer buffer(name, 1000, 512);
3197   MacroAssembler* masm                = new MacroAssembler(&buffer);
3198 
3199   int frame_size_in_words;
3200 
3201   OopMapSet *oop_maps = new OopMapSet();
3202   OopMap* map = NULL;
3203 
3204   int start = __ offset();
3205 
3206   // No need to save vector registers since they are caller-saved anyway.
3207   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3208 
3209   int frame_complete = __ offset();
3210 
3211   __ set_last_Java_frame(noreg, noreg, NULL);
3212 
3213   __ mov(c_rarg0, r15_thread);
3214 
3215   __ call(RuntimeAddress(destination));
3216 
3217 
3218   // Set an oopmap for the call site.
3219   // We need this not only for callee-saved registers, but also for volatile
3220   // registers that the compiler might be keeping live across a safepoint.
3221 
3222   oop_maps->add_gc_map( __ offset() - start, map);
3223 
3224   // rax contains the address we are going to jump to, assuming no exception got installed
3225 
3226   // clear last_Java_sp
3227   __ reset_last_Java_frame(false);
3228   // check for pending exceptions
3229   Label pending;
3230   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3231   __ jcc(Assembler::notEqual, pending);
3232 
3233   // get the returned Method*
3234   __ get_vm_result_2(rbx, r15_thread);
3235   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3236 
3237   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3238 
3239   RegisterSaver::restore_live_registers(masm);
3240 
3241   // We are back to the original state on entry and ready to go.
3242 
3243   __ jmp(rax);
3244 
3245   // Pending exception after the safepoint
3246 
3247   __ bind(pending);
3248 
3249   RegisterSaver::restore_live_registers(masm);
3250 
3251   // exception pending => remove activation and forward to exception handler
3252 
3253   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3254 
3255   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3256   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3257 
3258   // -------------
3259   // make sure all code is generated
3260   masm->flush();
3261 
3262   // return the blob
3263   // frame_size_in_words is in words (as filled in by RegisterSaver::save_live_registers above)
3264   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3265 }
3266 
3267 #ifdef COMPILER2
3268 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3269 
3270 class NativeInvokerGenerator : public StubCodeGenerator {
3271   address _call_target;
3272   int _shadow_space_bytes;
3273 
3274   const GrowableArray<VMReg>& _input_registers;
3275   const GrowableArray<VMReg>& _output_registers;
3276 
3277   int _frame_complete;
3278   int _framesize;
3279   OopMapSet* _oop_maps;
3280 public:
3281   NativeInvokerGenerator(CodeBuffer* buffer,
3282                          address call_target,
3283                          int shadow_space_bytes,
3284                          const GrowableArray<VMReg>& input_registers,
3285                          const GrowableArray<VMReg>& output_registers)
3286    : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3287      _call_target(call_target),
3288      _shadow_space_bytes(shadow_space_bytes),
3289      _input_registers(input_registers),
3290      _output_registers(output_registers),
3291      _frame_complete(0),
3292      _framesize(0),
3293      _oop_maps(NULL) {
3294     assert(_output_registers.length() <= 1
3295            || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3296 
3297   }
3298 
3299   void generate();
3300 
3301   int spill_size_in_bytes() const {
3302     if (_output_registers.length() == 0) {
3303       return 0;
3304     }
3305     VMReg reg = _output_registers.at(0);
3306     assert(reg->is_reg(), "must be a register");
3307     if (reg->is_Register()) {
3308       return 8;
3309     } else if (reg->is_XMMRegister()) {
3310       if (UseAVX >= 3) {
3311         return 64;
3312       } else if (UseAVX >= 1) {
3313         return 32;
3314       } else {
3315         return 16;
3316       }
3317     } else {
3318       ShouldNotReachHere();
3319     }
3320     return 0;
3321   }
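       // The spill sizes above track the widest vector state that can be live
       // in an XMM output register: 16 bytes for plain XMM, 32 for YMM under
       // AVX, 64 for ZMM under AVX-512; an integer or pointer result only
       // needs a single 8-byte slot.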
3322 
3323   void spill_out_registers() {
3324     if (_output_registers.length() == 0) {
3325       return;
3326     }
3327     VMReg reg = _output_registers.at(0);
3328     assert(reg->is_reg(), "must be a register");
3329     MacroAssembler* masm = _masm;
3330     if (reg->is_Register()) {
3331       __ movptr(Address(rsp, 0), reg->as_Register());
3332     } else if (reg->is_XMMRegister()) {
3333       if (UseAVX >= 3) {
3334         __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3335       } else if (UseAVX >= 1) {
3336         __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3337       } else {
3338         __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3339       }
3340     } else {
3341       ShouldNotReachHere();
3342     }
3343   }
3344 
3345   void fill_out_registers() {
3346     if (_output_registers.length() == 0) {
3347       return;
3348     }
3349     VMReg reg = _output_registers.at(0);
3350     assert(reg->is_reg(), "must be a register");
3351     MacroAssembler* masm = _masm;
3352     if (reg->is_Register()) {
3353       __ movptr(reg->as_Register(), Address(rsp, 0));
3354     } else if (reg->is_XMMRegister()) {
3355       if (UseAVX >= 3) {
3356         __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3357       } else if (UseAVX >= 1) {
3358         __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3359       } else {
3360         __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3361       }
3362     } else {
3363       ShouldNotReachHere();
3364     }
3365   }
3366 
3367   int frame_complete() const {
3368     return _frame_complete;
3369   }
3370 
3371   int framesize() const {
3372     return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3373   }
3374 
3375   OopMapSet* oop_maps() const {
3376     return _oop_maps;
3377   }
3378 
3379 private:
3380 #ifdef ASSERT
3381 bool target_uses_register(VMReg reg) {
3382   return _input_registers.contains(reg) || _output_registers.contains(reg);
3383 }
3384 #endif
3385 };
3386 
3387 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3388                                                 int shadow_space_bytes,
3389                                                 const GrowableArray<VMReg>& input_registers,
3390                                                 const GrowableArray<VMReg>& output_registers) {
3391   int locs_size  = 64;
3392   CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3393   NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3394   g.generate();
3395   code.log_section_sizes("nep_invoker_blob");
3396 
3397   RuntimeStub* stub =
3398     RuntimeStub::new_runtime_stub("nep_invoker_blob",
3399                                   &code,
3400                                   g.frame_complete(),
3401                                   g.framesize(),
3402                                   g.oop_maps(), false);
3403   return stub;
3404 }
3405 
3406 void NativeInvokerGenerator::generate() {
3407   assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3408 
3409   enum layout {
3410     rbp_off,
3411     rbp_off2,
3412     return_off,
3413     return_off2,
3414     framesize // inclusive of return address
3415   };
3416 
3417   _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3418   assert(is_even(_framesize/2), "sp not 16-byte aligned");
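       // Illustrative arithmetic with hypothetical inputs (no shadow space, an
       // 8-byte GP spill area): framesize (4 slots) + 8/4 = 6 slots, rounded up
       // to 8 slots, i.e. a 32-byte frame, so rsp stays 16-byte aligned after
       // the enter() and subptr() below.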
3419 
3420   _oop_maps  = new OopMapSet();
3421   MacroAssembler* masm = _masm;
3422 
3423   address start = __ pc();
3424 
3425   __ enter();
3426 
3427   // return address and rbp are already in place
3428   __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3429 
3430   _frame_complete = __ pc() - start;
3431 
3432   address the_pc = __ pc();
3433 
3434   __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3435   OopMap* map = new OopMap(_framesize, 0);
3436   _oop_maps->add_gc_map(the_pc - start, map);
3437 
3438   // State transition
3439   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3440 
3441   __ call(RuntimeAddress(_call_target));
3442 
3443   __ restore_cpu_control_state_after_jni();
3444 
3445   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3446 
3447   // Force this write out before the read below
3448   __ membar(Assembler::Membar_mask_bits(
3449           Assembler::LoadLoad | Assembler::LoadStore |
3450           Assembler::StoreLoad | Assembler::StoreStore));
3451 
3452   Label L_after_safepoint_poll;
3453   Label L_safepoint_poll_slow_path;
3454 
3455   __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3456   __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3457   __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3458 
3459   __ bind(L_after_safepoint_poll);
3460 
3461   // change thread state
3462   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3463 
3464   __ block_comment("reguard stack check");
3465   Label L_reguard;
3466   Label L_after_reguard;
3467   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3468   __ jcc(Assembler::equal, L_reguard);
3469   __ bind(L_after_reguard);
3470 
3471   __ reset_last_Java_frame(r15_thread, true);
3472 
3473   __ leave(); // required for proper stackwalking of RuntimeStub frame
3474   __ ret(0);
3475 
3476   //////////////////////////////////////////////////////////////////////////////
3477 
3478   __ block_comment("{ L_safepoint_poll_slow_path");
3479   __ bind(L_safepoint_poll_slow_path);
3480   __ vzeroupper();
3481 
3482   spill_out_registers();
3483 
3484   __ mov(c_rarg0, r15_thread);
3485   __ mov(r12, rsp); // remember sp
3486   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3487   __ andptr(rsp, -16); // align stack as required by ABI
3488   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3489   __ mov(rsp, r12); // restore sp
3490   __ reinit_heapbase();
3491 
3492   fill_out_registers();
3493 
3494   __ jmp(L_after_safepoint_poll);
3495   __ block_comment("} L_safepoint_poll_slow_path");
3496 
3497   //////////////////////////////////////////////////////////////////////////////
3498 
3499   __ block_comment("{ L_reguard");
3500   __ bind(L_reguard);
3501   __ vzeroupper();
3502 
3503   spill_out_registers();
3504 
3505   __ mov(r12, rsp); // remember sp
3506   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3507   __ andptr(rsp, -16); // align stack as required by ABI
3508   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3509   __ mov(rsp, r12); // restore sp
3510   __ reinit_heapbase();
3511 
3512   fill_out_registers();
3513 
3514   __ jmp(L_after_reguard);
3515 
3516   __ block_comment("} L_reguard");
3517 
3518   //////////////////////////////////////////////////////////////////////////////
3519 
3520   __ flush();
3521 }
3522 #endif // COMPILER2
3523 
3524 //------------------------------Montgomery multiplication------------------------
3525 //
3526 
3527 #ifndef _WINDOWS
3528 
3529 // Subtract 0:b from carry:a.  Return carry.
3530 static julong
3531 sub(julong a[], julong b[], julong carry, long len) {
3532   long long i = 0, cnt = len;
3533   julong tmp;
3534   asm volatile("clc; "
3535                "0: ; "
3536                "mov (%[b], %[i], 8), %[tmp]; "
3537                "sbb %[tmp], (%[a], %[i], 8); "
3538                "inc %[i]; dec %[cnt]; "
3539                "jne 0b; "
3540                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3541                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3542                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3543                : "memory");
3544   return tmp;
3545 }
3546 
3547 // Multiply (unsigned) Long A by Long B, accumulating the double-
3548 // length result into the accumulator formed of T0, T1, and T2.
3549 #define MACC(A, B, T0, T1, T2)                                  \
3550 do {                                                            \
3551   unsigned long hi, lo;                                         \
3552   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3553            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3554            : "r"(A), "a"(B) : "cc");                            \
3555  } while(0)
3556 
3557 // As above, but add twice the double-length result into the
3558 // accumulator.
3559 #define MACC2(A, B, T0, T1, T2)                                 \
3560 do {                                                            \
3561   unsigned long hi, lo;                                         \
3562   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3563            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3564            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3565            : "r"(A), "a"(B) : "cc");                            \
3566  } while(0)
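     // In both macros the triple (T2:T1:T0) acts as a 192-bit accumulator:
     // MACC adds the 128-bit product A*B into it once, MACC2 adds it twice,
     // with the adc chain propagating carries from T0 up into T2.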
3567 
3568 #else //_WINDOWS
3569 
3570 static julong
3571 sub(julong a[], julong b[], julong carry, long len) {
3572   long i;
3573   julong tmp;
3574   unsigned char c = 1;
3575   for (i = 0; i < len; i++) {
3576     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3577     a[i] = tmp;
3578   }
3579   c = _addcarry_u64(c, carry, ~0, &tmp);
3580   return tmp;
3581 }
3582 
3583 // Multiply (unsigned) Long A by Long B, accumulating the double-
3584 // length result into the accumulator formed of T0, T1, and T2.
3585 #define MACC(A, B, T0, T1, T2)                          \
3586 do {                                                    \
3587   julong hi, lo;                            \
3588   lo = _umul128(A, B, &hi);                             \
3589   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3590   c = _addcarry_u64(c, hi, T1, &T1);                    \
3591   _addcarry_u64(c, T2, 0, &T2);                         \
3592  } while(0)
3593 
3594 // As above, but add twice the double-length result into the
3595 // accumulator.
3596 #define MACC2(A, B, T0, T1, T2)                         \
3597 do {                                                    \
3598   julong hi, lo;                            \
3599   lo = _umul128(A, B, &hi);                             \
3600   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3601   c = _addcarry_u64(c, hi, T1, &T1);                    \
3602   _addcarry_u64(c, T2, 0, &T2);                         \
3603   c = _addcarry_u64(0, lo, T0, &T0);                    \
3604   c = _addcarry_u64(c, hi, T1, &T1);                    \
3605   _addcarry_u64(c, T2, 0, &T2);                         \
3606  } while(0)
3607 
3608 #endif //_WINDOWS
3609 
3610 // Fast Montgomery multiplication.  The derivation of the algorithm is
3611 // in  A Cryptographic Library for the Motorola DSP56000,
3612 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
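     // Sketch of the invariant the loops below maintain (standard Montgomery
     // reduction, restated here rather than quoted from the paper): with
     // inv == -n[0]^-1 (mod 2^64), choosing m[i] = t0 * inv makes
     // t0 + m[i]*n[0] == 0 (mod 2^64), so every column is exactly divisible
     // by 2^64 and the shift t0 = t1; t1 = t2 performs that division.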
3613 
3614 static void NOINLINE
3615 montgomery_multiply(julong a[], julong b[], julong n[],
3616                     julong m[], julong inv, int len) {
3617   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3618   int i;
3619 
3620   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3621 
3622   for (i = 0; i < len; i++) {
3623     int j;
3624     for (j = 0; j < i; j++) {
3625       MACC(a[j], b[i-j], t0, t1, t2);
3626       MACC(m[j], n[i-j], t0, t1, t2);
3627     }
3628     MACC(a[i], b[0], t0, t1, t2);
3629     m[i] = t0 * inv;
3630     MACC(m[i], n[0], t0, t1, t2);
3631 
3632     assert(t0 == 0, "broken Montgomery multiply");
3633 
3634     t0 = t1; t1 = t2; t2 = 0;
3635   }
3636 
3637   for (i = len; i < 2*len; i++) {
3638     int j;
3639     for (j = i-len+1; j < len; j++) {
3640       MACC(a[j], b[i-j], t0, t1, t2);
3641       MACC(m[j], n[i-j], t0, t1, t2);
3642     }
3643     m[i-len] = t0;
3644     t0 = t1; t1 = t2; t2 = 0;
3645   }
3646 
3647   while (t0)
3648     t0 = sub(m, n, t0, len);
3649 }
3650 
3651 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3652 // multiplies so it should be up to 25% faster than Montgomery
3653 // multiplication.  However, its loop control is more complex and it
3654 // may actually run slower on some machines.
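     // The saving comes from symmetry: a[j]*a[i-j] and a[i-j]*a[j] are the same
     // product, so the inner loops compute it once and add it twice via MACC2,
     // handling the lone square term a[j]*a[j] separately on even columns.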
3655 
3656 static void NOINLINE
3657 montgomery_square(julong a[], julong n[],
3658                   julong m[], julong inv, int len) {
3659   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3660   int i;
3661 
3662   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3663 
3664   for (i = 0; i < len; i++) {
3665     int j;
3666     int end = (i+1)/2;
3667     for (j = 0; j < end; j++) {
3668       MACC2(a[j], a[i-j], t0, t1, t2);
3669       MACC(m[j], n[i-j], t0, t1, t2);
3670     }
3671     if ((i & 1) == 0) {
3672       MACC(a[j], a[j], t0, t1, t2);
3673     }
3674     for (; j < i; j++) {
3675       MACC(m[j], n[i-j], t0, t1, t2);
3676     }
3677     m[i] = t0 * inv;
3678     MACC(m[i], n[0], t0, t1, t2);
3679 
3680     assert(t0 == 0, "broken Montgomery square");
3681 
3682     t0 = t1; t1 = t2; t2 = 0;
3683   }
3684 
3685   for (i = len; i < 2*len; i++) {
3686     int start = i-len+1;
3687     int end = start + (len - start)/2;
3688     int j;
3689     for (j = start; j < end; j++) {
3690       MACC2(a[j], a[i-j], t0, t1, t2);
3691       MACC(m[j], n[i-j], t0, t1, t2);
3692     }
3693     if ((i & 1) == 0) {
3694       MACC(a[j], a[j], t0, t1, t2);
3695     }
3696     for (; j < len; j++) {
3697       MACC(m[j], n[i-j], t0, t1, t2);
3698     }
3699     m[i-len] = t0;
3700     t0 = t1; t1 = t2; t2 = 0;
3701   }
3702 
3703   while (t0)
3704     t0 = sub(m, n, t0, len);
3705 }
3706 
3707 // Swap words in a longword.
3708 static julong swap(julong x) {
3709   return (x << 32) | (x >> 32);
3710 }
3711 
3712 // Copy len longwords from s to d, word-swapping as we go.  The
3713 // destination array is reversed.
3714 static void reverse_words(julong *s, julong *d, int len) {
3715   d += len;
3716   while(len-- > 0) {
3717     d--;
3718     *d = swap(*s);
3719     s++;
3720   }
3721 }
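     // Illustrative example (made-up values): with len == 2,
     // s == { 0x0000000200000001, 0x0000000400000003 } becomes
     // d == { 0x0000000300000004, 0x0000000100000002 }, i.e. the big-endian
     // jint limbs are turned into little-endian julong limbs.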
3722 
3723 // The threshold at which squaring is advantageous was determined
3724 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3725 #define MONTGOMERY_SQUARING_THRESHOLD 64
3726 
3727 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3728                                         jint len, jlong inv,
3729                                         jint *m_ints) {
3730   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3731   int longwords = len/2;
3732 
3733   // Make very sure we don't use so much space that the stack might
3734   // overflow.  512 jints corresponds to a 16384-bit integer and
3735   // will use a total of 8K bytes of stack space here.
3736   int total_allocation = longwords * sizeof (julong) * 4;
3737   guarantee(total_allocation <= 8192, "must be");
3738   julong *scratch = (julong *)alloca(total_allocation);
3739 
3740   // Local scratch arrays
3741   julong
3742     *a = scratch + 0 * longwords,
3743     *b = scratch + 1 * longwords,
3744     *n = scratch + 2 * longwords,
3745     *m = scratch + 3 * longwords;
3746 
3747   reverse_words((julong *)a_ints, a, longwords);
3748   reverse_words((julong *)b_ints, b, longwords);
3749   reverse_words((julong *)n_ints, n, longwords);
3750 
3751   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3752 
3753   reverse_words(m, (julong *)m_ints, longwords);
3754 }
3755 
3756 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3757                                       jint len, jlong inv,
3758                                       jint *m_ints) {
3759   assert(len % 2 == 0, "array length in montgomery_square must be even");
3760   int longwords = len/2;
3761 
3762   // Make very sure we don't use so much space that the stack might
3763   // overflow.  512 jints corresponds to a 16384-bit integer and
3764   // will use a total of 6K bytes of stack space here.
3765   int total_allocation = longwords * sizeof (julong) * 3;
3766   guarantee(total_allocation <= 8192, "must be");
3767   julong *scratch = (julong *)alloca(total_allocation);
3768 
3769   // Local scratch arrays
3770   julong
3771     *a = scratch + 0 * longwords,
3772     *n = scratch + 1 * longwords,
3773     *m = scratch + 2 * longwords;
3774 
3775   reverse_words((julong *)a_ints, a, longwords);
3776   reverse_words((julong *)n_ints, n, longwords);
3777 
3778   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3779     ::montgomery_square(a, n, m, (julong)inv, longwords);
3780   } else {
3781     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3782   }
3783 
3784   reverse_words(m, (julong *)m_ints, longwords);
3785 }
3786 
3787 #ifdef COMPILER2
3788 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3789 //
3790 //------------------------------generate_exception_blob---------------------------
3791 // creates exception blob at the end
3792 // Using exception blob, this code is jumped from a compiled method.
3793 // (see emit_exception_handler in x86_64.ad file)
3794 //
3795 // Given an exception pc at a call we call into the runtime for the
3796 // handler in this method. This handler might merely restore state
3797 // (i.e. callee save registers) unwind the frame and jump to the
3798 // exception handler for the nmethod if there is no Java level handler
3799 // for the nmethod.
3800 //
3801 // This code is entered with a jmp.
3802 //
3803 // Arguments:
3804 //   rax: exception oop
3805 //   rdx: exception pc
3806 //
3807 // Results:
3808 //   rax: exception oop
3809 //   rdx: exception pc in caller or ???
3810 //   destination: exception handler of caller
3811 //
3812 // Note: the exception pc MUST be at a call (precise debug information)
3813 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3814 //
3815 
3816 void OptoRuntime::generate_exception_blob() {
3817   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3818   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3819   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3820 
3821   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3822 
3823   // Allocate space for the code
3824   ResourceMark rm;
3825   // Setup code generation tools
3826   CodeBuffer buffer("exception_blob", 2048, 1024);
3827   MacroAssembler* masm = new MacroAssembler(&buffer);
3828 
3829 
3830   address start = __ pc();
3831 
3832   // Exception pc is 'return address' for stack walker
3833   __ push(rdx);
3834   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3835 
3836   // Save callee-saved registers.  See x86_64.ad.
3837 
3838   // rbp is an implicitly saved callee saved register (i.e., the calling
3839   // convention will save/restore it in the prolog/epilog). Other than that
3840   // there are no callee save registers now that adapter frames are gone.
3841 
3842   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3843 
3844   // Store exception in Thread object. We cannot pass any arguments to the
3845   // handle_exception call, since we do not want to make any assumption
3846   // about the size of the frame where the exception happened in.
3847   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3848   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3849   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3850 
3851   // This call does all the hard work.  It checks if an exception handler
3852   // exists in the method.
3853   // If so, it returns the handler address.
3854   // If not, it prepares for stack-unwinding, restoring the callee-save
3855   // registers of the frame being removed.
3856   //
3857   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3858 
3859   // At a method handle call, the stack may not be properly aligned
3860   // when returning with an exception.
3861   address the_pc = __ pc();
3862   __ set_last_Java_frame(noreg, noreg, the_pc);
3863   __ mov(c_rarg0, r15_thread);
3864   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3865   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3866 
3867   // Set an oopmap for the call site.  This oopmap will only be used if we
3868   // are unwinding the stack.  Hence, all locations will be dead.
3869   // Callee-saved registers will be the same as the frame above (i.e.,
3870   // handle_exception_stub), since they were restored when we got the
3871   // exception.
3872 
3873   OopMapSet* oop_maps = new OopMapSet();
3874 
3875   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3876 
3877   __ reset_last_Java_frame(false);
3878 
3879   // Restore callee-saved registers
3880 
3881   // rbp is an implicitly saved callee-saved register (i.e., the calling
3882   // convention will save/restore it in the prolog/epilog). Other than that
3883   // there are no callee save registers now that adapter frames are gone.
3884 
3885   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3886 
3887   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3888   __ pop(rdx);                  // No need for exception pc anymore
3889 
3890   // rax: exception handler
3891 
3892   // We have a handler in rax (could be deopt blob).
3893   __ mov(r8, rax);
3894 
3895   // Get the exception oop
3896   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3897   // Get the exception pc in case we are deoptimized
3898   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3899 #ifdef ASSERT
3900   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3901   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3902 #endif
3903   // Clear the exception oop so GC no longer processes it as a root.
3904   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3905 
3906   // rax: exception oop
3907   // r8:  exception handler
3908   // rdx: exception pc
3909   // Jump to handler
3910 
3911   __ jmp(r8);
3912 
3913   // Make sure all code is generated
3914   masm->flush();
3915 
3916   // Set exception blob
3917   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3918 }
3919 #endif // COMPILER2
3920 
3921 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
3922                                        int total_in_args, const VMRegPair* in_regs,
3923                                        int total_out_args, VMRegPair* out_regs,
3924                                        GrowableArray<int>& arg_order,
3925                                        VMRegPair tmp_vmreg) {
3926   ComputeMoveOrder order(total_in_args, in_regs,
3927                          total_out_args, out_regs,
3928                          in_sig_bt, arg_order, tmp_vmreg);
3929 }