1 /*
2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "precompiled.hpp"
26 #ifndef _WINDOWS
27 #include "alloca.h"
28 #endif
29 #include "asm/macroAssembler.hpp"
30 #include "asm/macroAssembler.inline.hpp"
31 #include "code/debugInfoRec.hpp"
32 #include "code/icBuffer.hpp"
33 #include "code/nativeInst.hpp"
34 #include "code/vtableStubs.hpp"
35 #include "compiler/oopMap.hpp"
36 #include "gc/shared/collectedHeap.hpp"
37 #include "gc/shared/gcLocker.hpp"
38 #include "gc/shared/barrierSet.hpp"
39 #include "gc/shared/barrierSetAssembler.hpp"
40 #include "interpreter/interpreter.hpp"
41 #include "logging/log.hpp"
42 #include "memory/resourceArea.hpp"
43 #include "memory/universe.hpp"
44 #include "oops/compiledICHolder.hpp"
45 #include "oops/klass.inline.hpp"
46 #include "prims/methodHandles.hpp"
47 #include "runtime/jniHandles.hpp"
48 #include "runtime/safepointMechanism.hpp"
49 #include "runtime/sharedRuntime.hpp"
50 #include "runtime/signature.hpp"
51 #include "runtime/stubRoutines.hpp"
52 #include "runtime/vframeArray.hpp"
53 #include "runtime/vm_version.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/formatBuffer.hpp"
56 #include "vmreg_x86.inline.hpp"
57 #ifdef COMPILER1
58 #include "c1/c1_Runtime1.hpp"
59 #endif
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_JVMCI
64 #include "jvmci/jvmciJavaClasses.hpp"
65 #endif
66
67 #define __ masm->
68
69 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
70
71 class SimpleRuntimeFrame {
72
73 public:
74
75 // Most of the runtime stubs have this simple frame layout.
76 // This class exists to make the layout shared in one place.
77 // Offsets are for compiler stack slots, which are jints.
78 enum layout {
79 // The frame sender code expects that rbp will be in the "natural" place and
80 // will override any oopMap setting for it. We must therefore force the layout
81 // so that it agrees with the frame sender code.
82 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
83 rbp_off2,
84 return_off, return_off2,
85 framesize
86 };
87 };
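// For illustration: the layout above corresponds, from low to high addresses, to
// [optional arg register save area][saved rbp (2 slots)][return address (2 slots)],
// with 'framesize' giving the total size in 4-byte compiler slots.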
88
89 class RegisterSaver {
90 // Capture info about frame layout. Layout offsets are in jint
91 // units because compiler frame slots are jints.
92 #define XSAVE_AREA_BEGIN 160
93 #define XSAVE_AREA_YMM_BEGIN 576
94 #define XSAVE_AREA_OPMASK_BEGIN 1088
95 #define XSAVE_AREA_ZMM_BEGIN 1152
96 #define XSAVE_AREA_UPPERBANK 1664
97 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
98 #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
99 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
100 #define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
101 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
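// The values above are byte offsets into the save area laid down by push_CPU_state():
// XMM0 starts at offset 160 of the legacy fxsave image, and the later XSAVE_AREA_*
// offsets are where the code below stores the upper YMM/ZMM halves and the opmask
// registers (assumption: they mirror the standard XSAVE extended-area layout).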
102 enum layout {
103 fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
104 xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
105 DEF_XMM_OFFS(0),
106 DEF_XMM_OFFS(1),
107 // 2..15 are implied in range usage
108 ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
109 DEF_YMM_OFFS(0),
110 DEF_YMM_OFFS(1),
111 // 2..15 are implied in range usage
112 opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
113 DEF_OPMASK_OFFS(0),
114 DEF_OPMASK_OFFS(1),
115 // 2..7 are implied in range usage
116 zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
117 DEF_ZMM_OFFS(0),
118 DEF_ZMM_OFFS(1),
119 zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
120 DEF_ZMM_UPPER_OFFS(16),
121 DEF_ZMM_UPPER_OFFS(17),
122 // 18..31 are implied in range usage
123 fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
124 fpu_stateH_end,
125 r15_off, r15H_off,
126 r14_off, r14H_off,
127 r13_off, r13H_off,
128 r12_off, r12H_off,
129 r11_off, r11H_off,
130 r10_off, r10H_off,
131 r9_off, r9H_off,
132 r8_off, r8H_off,
133 rdi_off, rdiH_off,
134 rsi_off, rsiH_off,
135 ignore_off, ignoreH_off, // extra copy of rbp
136 rsp_off, rspH_off,
137 rbx_off, rbxH_off,
138 rdx_off, rdxH_off,
139 rcx_off, rcxH_off,
140 rax_off, raxH_off,
141 // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
142 align_off, alignH_off,
143 flags_off, flagsH_off,
144 // The frame sender code expects that rbp will be in the "natural" place and
145 // will override any oopMap setting for it. We must therefore force the layout
146 // so that it agrees with the frame sender code.
147 rbp_off, rbpH_off, // copy of rbp we will restore
148 return_off, returnH_off, // slot for return address
149 reg_save_size // size in compiler stack slots
150 };
151
152 public:
153 static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
154 static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
155
156 // Offsets into the register save area
157 // Used by deoptimization when it is managing result register
158 // values on its own
159
160 static int rax_offset_in_bytes(void) { return BytesPerInt * rax_off; }
161 static int rdx_offset_in_bytes(void) { return BytesPerInt * rdx_off; }
162 static int rbx_offset_in_bytes(void) { return BytesPerInt * rbx_off; }
163 static int xmm0_offset_in_bytes(void) { return BytesPerInt * xmm0_off; }
164 static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
165
166 // During deoptimization only the result registers need to be restored,
167 // all the other values have already been extracted.
168 static void restore_result_registers(MacroAssembler* masm);
169 };
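// Typical usage sketch (illustration only; the safepoint/exception blobs later in this
// file follow roughly this pattern):
//
//   int frame_size_in_words;
//   OopMap* map = RegisterSaver::save_live_registers(masm, 0 /* additional_frame_words */,
//                                                    &frame_size_in_words, save_wide_vectors);
//   // ... emit the call into the VM ...
//   RegisterSaver::restore_live_registers(masm, save_wide_vectors);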
170
171 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
172 int off = 0;
173 int num_xmm_regs = XMMRegisterImpl::number_of_registers;
174 if (UseAVX < 3) {
175 num_xmm_regs = num_xmm_regs/2;
176 }
177 #if COMPILER2_OR_JVMCI
178 if (save_wide_vectors && UseAVX == 0) {
179 save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
180 }
181 assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
182 #else
183 save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
184 #endif
185
186 // Always make the frame size 16-byte aligned; both vector and non-vector stacks are allocated this way
187 int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
188 // OopMap frame size is in compiler stack slots (jint's) not bytes or words
189 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
190 // CodeBlob frame size is in words.
191 int frame_size_in_words = frame_size_in_bytes / wordSize;
192 *total_frame_words = frame_size_in_words;
193
194 // Save registers, fpu state, and flags.
195 // We assume caller has already pushed the return address onto the
196 // stack, so rsp is 8-byte aligned here.
197 // We push rbp twice in this sequence because we want the real rbp
198 // to be under the return address like a normal enter.
199
200 __ enter(); // rsp becomes 16-byte aligned here
201 __ push_CPU_state(); // Push a multiple of 16 bytes
202
203 // push_CPU_state handles this on EVEX enabled targets
204 if (save_wide_vectors) {
205 // Save upper half of YMM registers(0..15)
206 int base_addr = XSAVE_AREA_YMM_BEGIN;
207 for (int n = 0; n < 16; n++) {
208 __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
209 }
210 if (VM_Version::supports_evex()) {
211 // Save upper half of ZMM registers(0..15)
212 base_addr = XSAVE_AREA_ZMM_BEGIN;
213 for (int n = 0; n < 16; n++) {
214 __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
215 }
216 // Save full ZMM registers(16..num_xmm_regs)
217 base_addr = XSAVE_AREA_UPPERBANK;
218 off = 0;
219 int vector_len = Assembler::AVX_512bit;
220 for (int n = 16; n < num_xmm_regs; n++) {
221 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
222 }
223 #if COMPILER2_OR_JVMCI
224 base_addr = XSAVE_AREA_OPMASK_BEGIN;
225 off = 0;
226 for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
227 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
228 }
229 #endif
230 }
231 } else {
232 if (VM_Version::supports_evex()) {
233 // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
234 int base_addr = XSAVE_AREA_UPPERBANK;
235 off = 0;
236 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
237 for (int n = 16; n < num_xmm_regs; n++) {
238 __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
239 }
240 #if COMPILER2_OR_JVMCI
241 base_addr = XSAVE_AREA_OPMASK_BEGIN;
242 off = 0;
243 for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
244 __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
245 }
246 #endif
247 }
248 }
249 __ vzeroupper();
250 if (frame::arg_reg_save_area_bytes != 0) {
251 // Allocate argument register save area
252 __ subptr(rsp, frame::arg_reg_save_area_bytes);
253 }
254
255 // Set an oopmap for the call site. This oopmap will map all
256 // oop-registers and debug-info registers as callee-saved. This
257 // will allow deoptimization at this safepoint to find all possible
258 // debug-info recordings, as well as let GC find all oops.
259
260 OopMapSet *oop_maps = new OopMapSet();
261 OopMap* map = new OopMap(frame_size_in_slots, 0);
262
263 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
264
265 map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
266 map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
267 map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
268 map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
269 // rbp location is known implicitly by the frame sender code, needs no oopmap
270 // and the location where rbp was saved is ignored
271 map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
272 map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
273 map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg());
274 map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg());
275 map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
276 map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
277 map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
278 map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
279 map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
280 map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
281 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
282 // on EVEX enabled targets, we get it included in the xsave area
283 off = xmm0_off;
284 int delta = xmm1_off - off;
285 for (int n = 0; n < 16; n++) {
286 XMMRegister xmm_name = as_XMMRegister(n);
287 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
288 off += delta;
289 }
290 if (UseAVX > 2) {
291 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
292 off = zmm16_off;
293 delta = zmm17_off - off;
294 for (int n = 16; n < num_xmm_regs; n++) {
295 XMMRegister zmm_name = as_XMMRegister(n);
296 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
297 off += delta;
298 }
299 }
300
301 #if COMPILER2_OR_JVMCI
302 if (save_wide_vectors) {
303 // Save upper half of YMM registers(0..15)
304 off = ymm0_off;
305 delta = ymm1_off - ymm0_off;
306 for (int n = 0; n < 16; n++) {
307 XMMRegister ymm_name = as_XMMRegister(n);
308 map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
309 off += delta;
310 }
311 if (VM_Version::supports_evex()) {
312 // Save upper half of ZMM registers(0..15)
313 off = zmm0_off;
314 delta = zmm1_off - zmm0_off;
315 for (int n = 0; n < 16; n++) {
316 XMMRegister zmm_name = as_XMMRegister(n);
317 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
318 off += delta;
319 }
320 }
321 }
322 #endif // COMPILER2_OR_JVMCI
323
324 // %%% These should all be a waste but we'll keep things as they were for now
325 if (true) {
326 map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
327 map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
328 map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
329 map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
330 // rbp location is known implicitly by the frame sender code, needs no oopmap
331 map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
332 map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
333 map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next());
334 map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next());
335 map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
336 map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
337 map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
338 map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
339 map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
340 map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
341 // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
342 // on EVEX enabled targets, we get it included in the xsave area
343 off = xmm0H_off;
344 delta = xmm1H_off - off;
345 for (int n = 0; n < 16; n++) {
346 XMMRegister xmm_name = as_XMMRegister(n);
347 map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
348 off += delta;
349 }
350 if (UseAVX > 2) {
351 // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
352 off = zmm16H_off;
353 delta = zmm17H_off - off;
354 for (int n = 16; n < num_xmm_regs; n++) {
355 XMMRegister zmm_name = as_XMMRegister(n);
356 map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
357 off += delta;
358 }
359 }
360 }
361
362 return map;
363 }
364
365 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
366 int num_xmm_regs = XMMRegisterImpl::number_of_registers;
367 if (UseAVX < 3) {
368 num_xmm_regs = num_xmm_regs/2;
369 }
370 if (frame::arg_reg_save_area_bytes != 0) {
371 // Pop arg register save area
372 __ addptr(rsp, frame::arg_reg_save_area_bytes);
373 }
374
375 #if COMPILER2_OR_JVMCI
376 if (restore_wide_vectors) {
377 assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
378 assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
379 }
380 #else
381 assert(!restore_wide_vectors, "vectors are generated only by C2");
382 #endif
383
384 __ vzeroupper();
385
386 // On EVEX enabled targets everything is handled in pop fpu state
387 if (restore_wide_vectors) {
388 // Restore upper half of YMM registers (0..15)
389 int base_addr = XSAVE_AREA_YMM_BEGIN;
390 for (int n = 0; n < 16; n++) {
391 __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
392 }
393 if (VM_Version::supports_evex()) {
394 // Restore upper half of ZMM registers (0..15)
395 base_addr = XSAVE_AREA_ZMM_BEGIN;
396 for (int n = 0; n < 16; n++) {
397 __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
398 }
399 // Restore full ZMM registers(16..num_xmm_regs)
400 base_addr = XSAVE_AREA_UPPERBANK;
401 int vector_len = Assembler::AVX_512bit;
402 int off = 0;
403 for (int n = 16; n < num_xmm_regs; n++) {
404 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
405 }
406 #if COMPILER2_OR_JVMCI
407 base_addr = XSAVE_AREA_OPMASK_BEGIN;
408 off = 0;
409 for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
410 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
411 }
412 #endif
413 }
414 } else {
415 if (VM_Version::supports_evex()) {
416 // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
417 int base_addr = XSAVE_AREA_UPPERBANK;
418 int off = 0;
419 int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
420 for (int n = 16; n < num_xmm_regs; n++) {
421 __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
422 }
423 #if COMPILER2_OR_JVMCI
424 base_addr = XSAVE_AREA_OPMASK_BEGIN;
425 off = 0;
426 for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
427 __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
428 }
429 #endif
430 }
431 }
432
433 // Recover CPU state
434 __ pop_CPU_state();
435 // Get the rbp described implicitly by the calling convention (no oopMap)
436 __ pop(rbp);
437 }
438
439 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
440
441 // Just restore result register. Only used by deoptimization. By
442 // now any callee save register that needs to be restored to a c2
443 // caller of the deoptee has been extracted into the vframeArray
444 // and will be stuffed into the c2i adapter we create for later
445 // restoration so only result registers need to be restored here.
446
447 // Restore fp result register
448 __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
449 // Restore integer result register
450 __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
451 __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
452
453 // Pop all of the register save area off the stack except the return address
454 __ addptr(rsp, return_offset_in_bytes());
455 }
456
457 // Is the vector's size (in bytes) bigger than the size saved by default?
458 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
459 bool SharedRuntime::is_wide_vector(int size) {
460 return size > 16;
461 }
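// For example, a 32-byte YMM or 64-byte ZMM vector is "wide" and needs the extra
// save/restore logic above, while a 16-byte XMM vector is already covered by fxsave.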
462
463 // ---------------------------------------------------------------------------
464 // Read the array of BasicTypes from a signature, and compute where the
465 // arguments should go. Values in the VMRegPair regs array refer to 4-byte
466 // quantities. Values less than VMRegImpl::stack0 are registers, those above
467 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer
468 // as framesizes are fixed.
469 // VMRegImpl::stack0 refers to the first slot 0(sp),
470 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Register
471 // values up to RegisterImpl::number_of_registers are the 64-bit
472 // integer registers.
473
474 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
475 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
476 // units regardless of build. Of course for i486 there is no 64 bit build
477
478 // The Java calling convention is a "shifted" version of the C ABI.
479 // By skipping the first C ABI register we can call non-static jni methods
480 // with small numbers of arguments without having to shuffle the arguments
481 // at all. Since we control the java ABI we ought to at least get some
482 // advantage out of it.
483
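// Illustrative example (not used by the code): for a signature whose expanded BasicType
// array is { T_INT, T_LONG, T_VOID, T_OBJECT, T_DOUBLE, T_VOID, T_FLOAT } (e.g. a static
// method taking int, long, Object, double, float), the loop below assigns
//   T_INT    -> j_rarg0      T_LONG   -> j_rarg1 (both halves)
//   T_OBJECT -> j_rarg2      T_DOUBLE -> j_farg0 (both halves)
//   T_FLOAT  -> j_farg1      T_VOID   -> BAD (the unused high halves)
// and, since no stack slots are needed, returns 0.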
484 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
485 VMRegPair *regs,
486 int total_args_passed) {
487
488 // Create the mapping between argument positions and
489 // registers.
490 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
491 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
492 };
493 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
494 j_farg0, j_farg1, j_farg2, j_farg3,
495 j_farg4, j_farg5, j_farg6, j_farg7
496 };
497
498
499 uint int_args = 0;
500 uint fp_args = 0;
501 uint stk_args = 0; // inc by 2 each time
502
503 for (int i = 0; i < total_args_passed; i++) {
504 switch (sig_bt[i]) {
505 case T_BOOLEAN:
506 case T_CHAR:
507 case T_BYTE:
508 case T_SHORT:
509 case T_INT:
510 if (int_args < Argument::n_int_register_parameters_j) {
511 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
512 } else {
513 regs[i].set1(VMRegImpl::stack2reg(stk_args));
514 stk_args += 2;
515 }
516 break;
517 case T_VOID:
518 // halves of T_LONG or T_DOUBLE
519 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
520 regs[i].set_bad();
521 break;
522 case T_LONG:
523 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
524 // fall through
525 case T_OBJECT:
526 case T_ARRAY:
527 case T_ADDRESS:
528 if (int_args < Argument::n_int_register_parameters_j) {
529 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
530 } else {
531 regs[i].set2(VMRegImpl::stack2reg(stk_args));
532 stk_args += 2;
533 }
534 break;
535 case T_FLOAT:
536 if (fp_args < Argument::n_float_register_parameters_j) {
537 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
538 } else {
539 regs[i].set1(VMRegImpl::stack2reg(stk_args));
540 stk_args += 2;
541 }
542 break;
543 case T_DOUBLE:
544 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
545 if (fp_args < Argument::n_float_register_parameters_j) {
546 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
547 } else {
548 regs[i].set2(VMRegImpl::stack2reg(stk_args));
549 stk_args += 2;
550 }
551 break;
552 default:
553 ShouldNotReachHere();
554 break;
555 }
556 }
557
558 return align_up(stk_args, 2);
559 }
560
561 // Patch the caller's callsite with the entry to compiled code if it exists.
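// On entry rbx holds the callee Method* and the return address is still at 0(rsp); all
// argument registers are live, which is why the full CPU state is saved around the call
// into the VM below.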
562 static void patch_callers_callsite(MacroAssembler *masm) {
563 Label L;
564 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
565 __ jcc(Assembler::equal, L);
566
567 // Save the current stack pointer
568 __ mov(r13, rsp);
569 // Schedule the branch target address early.
570 // Call into the VM to patch the caller, then jump to compiled callee
571 // rax isn't live so capture return address while we easily can
572 __ movptr(rax, Address(rsp, 0));
573
574 // align stack so push_CPU_state doesn't fault
575 __ andptr(rsp, -(StackAlignmentInBytes));
576 __ push_CPU_state();
577 __ vzeroupper();
578 // VM needs caller's callsite
579 // VM needs target method
580 // This needs to be a long call since we will relocate this adapter to
581 // the codeBuffer and it may not reach
582
583 // Allocate argument register save area
584 if (frame::arg_reg_save_area_bytes != 0) {
585 __ subptr(rsp, frame::arg_reg_save_area_bytes);
586 }
587 __ mov(c_rarg0, rbx);
588 __ mov(c_rarg1, rax);
589 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
590
591 // De-allocate argument register save area
592 if (frame::arg_reg_save_area_bytes != 0) {
593 __ addptr(rsp, frame::arg_reg_save_area_bytes);
594 }
595
596 __ vzeroupper();
597 __ pop_CPU_state();
598 // restore sp
599 __ mov(rsp, r13);
600 __ bind(L);
601 }
602
603
604 static void gen_c2i_adapter(MacroAssembler *masm,
605 int total_args_passed,
606 int comp_args_on_stack,
607 const BasicType *sig_bt,
608 const VMRegPair *regs,
609 Label& skip_fixup) {
610 // Before we get into the guts of the C2I adapter, see if we should be here
611 // at all. We've come from compiled code and are attempting to jump to the
612 // interpreter, which means the caller made a static call to get here
613 // (vcalls always get a compiled target if there is one). Check for a
614 // compiled target. If there is one, we need to patch the caller's call.
615 patch_callers_callsite(masm);
616
617 __ bind(skip_fixup);
618
619 // Since all args are passed on the stack, total_args_passed *
620 // Interpreter::stackElementSize is the space we need.  Plus one extra word
621 // for the return address location, since we store it first rather than
622 // holding it in rax across all the shuffling.
623
624 int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
625
626 // stack is aligned, keep it that way
627 extraspace = align_up(extraspace, 2*wordSize);
628
629 // Get return address
630 __ pop(rax);
631
632 // set senderSP value
633 __ mov(r13, rsp);
634
635 __ subptr(rsp, extraspace);
636
637 // Store the return address in the expected location
638 __ movptr(Address(rsp, 0), rax);
639
640 // Now write the args into the outgoing interpreter space
641 for (int i = 0; i < total_args_passed; i++) {
642 if (sig_bt[i] == T_VOID) {
643 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
644 continue;
645 }
646
647 // offset to start parameters
648 int st_off = (total_args_passed - i) * Interpreter::stackElementSize;
649 int next_off = st_off - Interpreter::stackElementSize;
650
651 // Say 4 args:
652 // i st_off
653 // 0 32 T_LONG
654 // 1 24 T_VOID
655 // 2 16 T_OBJECT
656 // 3 8 T_BOOL
657 // - 0 return address
658 //
659 // However, to make things extra confusing: because we can fit a long/double in
660 // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
661 // leaves one slot empty and only stores to a single slot. In this case the
662 // slot that is occupied is the T_VOID slot. See, I said it was confusing.
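// For the example above: the T_LONG at i == 0 is actually stored at next_off == 24 (the
// T_VOID slot), while st_off == 32 is left unused (and filled with junk in debug builds).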
663
664 VMReg r_1 = regs[i].first();
665 VMReg r_2 = regs[i].second();
666 if (!r_1->is_valid()) {
667 assert(!r_2->is_valid(), "");
668 continue;
669 }
670 if (r_1->is_stack()) {
671 // memory to memory use rax
672 int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
673 if (!r_2->is_valid()) {
674 // sign extend??
675 __ movl(rax, Address(rsp, ld_off));
676 __ movptr(Address(rsp, st_off), rax);
677
678 } else {
679
680 __ movq(rax, Address(rsp, ld_off));
681
682 // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
683 // T_DOUBLE and T_LONG use two slots in the interpreter
684 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
685 // ld_off == LSW, ld_off+wordSize == MSW
686 // st_off == MSW, next_off == LSW
687 __ movq(Address(rsp, next_off), rax);
688 #ifdef ASSERT
689 // Overwrite the unused slot with known junk
690 __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
691 __ movptr(Address(rsp, st_off), rax);
692 #endif /* ASSERT */
693 } else {
694 __ movq(Address(rsp, st_off), rax);
695 }
696 }
697 } else if (r_1->is_Register()) {
698 Register r = r_1->as_Register();
699 if (!r_2->is_valid()) {
700 // must be only an int (or less), so move only 32 bits to the slot
701 // why not sign extend??
702 __ movl(Address(rsp, st_off), r);
703 } else {
704 // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
705 // T_DOUBLE and T_LONG use two slots in the interpreter
706 if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
707 // long/double in gpr
708 #ifdef ASSERT
709 // Overwrite the unused slot with known junk
710 __ mov64(rax, CONST64(0xdeadffffdeadaaab));
711 __ movptr(Address(rsp, st_off), rax);
712 #endif /* ASSERT */
713 __ movq(Address(rsp, next_off), r);
714 } else {
715 __ movptr(Address(rsp, st_off), r);
716 }
717 }
718 } else {
719 assert(r_1->is_XMMRegister(), "");
720 if (!r_2->is_valid()) {
721 // only a float, use just part of the slot
722 __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
723 } else {
724 #ifdef ASSERT
725 // Overwrite the unused slot with known junk
726 __ mov64(rax, CONST64(0xdeadffffdeadaaac));
727 __ movptr(Address(rsp, st_off), rax);
728 #endif /* ASSERT */
729 __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
730 }
731 }
732 }
733
734 // Schedule the branch target address early.
735 __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
736 __ jmp(rcx);
737 }
738
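// Branches to L_ok if pc_reg lies strictly inside (code_start, code_end); otherwise it
// falls through so the caller can emit the failure path immediately after.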
739 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
740 address code_start, address code_end,
741 Label& L_ok) {
742 Label L_fail;
743 __ lea(temp_reg, ExternalAddress(code_start));
744 __ cmpptr(pc_reg, temp_reg);
745 __ jcc(Assembler::belowEqual, L_fail);
746 __ lea(temp_reg, ExternalAddress(code_end));
747 __ cmpptr(pc_reg, temp_reg);
748 __ jcc(Assembler::below, L_ok);
749 __ bind(L_fail);
750 }
751
752 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
753 int total_args_passed,
754 int comp_args_on_stack,
755 const BasicType *sig_bt,
756 const VMRegPair *regs) {
757
758 // Note: r13 contains the senderSP on entry. We must preserve it since
759 // we may do an i2c -> c2i transition if we lose a race where compiled
760 // code goes non-entrant while we get args ready.
761 // In addition we use r13 to locate all the interpreter args as
762 // we must align the stack to 16 bytes on an i2c entry, else we
763 // lose the alignment we expect in all compiled code and the register
764 // save code can segv when fxsave instructions find an improperly
765 // aligned stack pointer.
766
767 // Adapters can be frameless because they do not require the caller
768 // to perform additional cleanup work, such as correcting the stack pointer.
769 // An i2c adapter is frameless because the *caller* frame, which is interpreted,
770 // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
771 // even if a callee has modified the stack pointer.
772 // A c2i adapter is frameless because the *callee* frame, which is interpreted,
773 // routinely repairs its caller's stack pointer (from sender_sp, which is set
774 // up via the senderSP register).
775 // In other words, if *either* the caller or callee is interpreted, we can
776 // get the stack pointer repaired after a call.
777 // This is why c2i and i2c adapters cannot be indefinitely composed.
778 // In particular, if a c2i adapter were to somehow call an i2c adapter,
779 // both caller and callee would be compiled methods, and neither would
780 // clean up the stack pointer changes performed by the two adapters.
781 // If this happens, control eventually transfers back to the compiled
782 // caller, but with an uncorrected stack, causing delayed havoc.
783
784 // Pick up the return address
785 __ movptr(rax, Address(rsp, 0));
786
787 if (VerifyAdapterCalls &&
788 (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
789 // So, let's test for cascading c2i/i2c adapters right now.
790 // assert(Interpreter::contains($return_addr) ||
791 // StubRoutines::contains($return_addr),
792 // "i2c adapter must return to an interpreter frame");
793 __ block_comment("verify_i2c { ");
794 Label L_ok;
795 if (Interpreter::code() != NULL)
796 range_check(masm, rax, r11,
797 Interpreter::code()->code_start(), Interpreter::code()->code_end(),
798 L_ok);
799 if (StubRoutines::code1() != NULL)
800 range_check(masm, rax, r11,
801 StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
802 L_ok);
803 if (StubRoutines::code2() != NULL)
804 range_check(masm, rax, r11,
805 StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
806 L_ok);
807 const char* msg = "i2c adapter must return to an interpreter frame";
808 __ block_comment(msg);
809 __ stop(msg);
810 __ bind(L_ok);
811 __ block_comment("} verify_i2c ");
812 }
813
814 // Must preserve original SP for loading incoming arguments because
815 // we need to align the outgoing SP for compiled code.
816 __ movptr(r11, rsp);
817
818 // Cut-out for having no stack args. Since up to 6 int/oop args are passed
819 // in registers, we will occasionally have no stack args.
820 int comp_words_on_stack = 0;
821 if (comp_args_on_stack) {
822 // Sig words on the stack are greater-than VMRegImpl::stack0. Those in
823 // registers are below. By subtracting stack0, we either get a negative
824 // number (all values in registers) or the maximum stack slot accessed.
825
826 // Convert 4-byte c2 stack slots to words.
827 comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
828 // Round up to minimum stack alignment, in wordSize
829 comp_words_on_stack = align_up(comp_words_on_stack, 2);
830 __ subptr(rsp, comp_words_on_stack * wordSize);
831 }
832
833
834 // Ensure compiled code always sees stack at proper alignment
835 __ andptr(rsp, -16);
836
837 // push the return address and misalign the stack so that the youngest frame looks
838 // just as it would immediately after a call instruction
839 __ push(rax);
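// rsp is now 16-byte aligned minus one word, which is exactly what compiled code expects
// to see immediately after a call instruction.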
840
841 // Put saved SP in another register
842 const Register saved_sp = rax;
843 __ movptr(saved_sp, r11);
844
845 // Will jump to the compiled code just as if compiled code was doing it.
846 // Pre-load the register-jump target early, to schedule it better.
847 __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
848
849 #if INCLUDE_JVMCI
850 if (EnableJVMCI) {
851 // check if this call should be routed towards a specific entry point
852 __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
853 Label no_alternative_target;
854 __ jcc(Assembler::equal, no_alternative_target);
855 __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
856 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
857 __ bind(no_alternative_target);
858 }
859 #endif // INCLUDE_JVMCI
860
861 // Now generate the shuffle code. Pick up all register args and move the
862 // rest through the floating point stack top.
863 for (int i = 0; i < total_args_passed; i++) {
864 if (sig_bt[i] == T_VOID) {
865 // Longs and doubles are passed in native word order, but misaligned
866 // in the 32-bit build.
867 assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
868 continue;
869 }
870
871 // Pick up 0, 1 or 2 words from SP+offset.
872
873 assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
874 "scrambled load targets?");
875 // Load in argument order going down.
876 int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
877 // Point to interpreter value (vs. tag)
878 int next_off = ld_off - Interpreter::stackElementSize;
879 //
880 //
881 //
882 VMReg r_1 = regs[i].first();
883 VMReg r_2 = regs[i].second();
884 if (!r_1->is_valid()) {
885 assert(!r_2->is_valid(), "");
886 continue;
887 }
888 if (r_1->is_stack()) {
889 // Convert stack slot to an SP offset (+ wordSize to account for return address )
890 int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
891
892 // We can use r13 as a temp here because compiled code doesn't need r13 as an input
893 // and if we end up going through a c2i because of a miss, a reasonable value of r13
894 // will be generated.
895 if (!r_2->is_valid()) {
896 // sign extend???
897 __ movl(r13, Address(saved_sp, ld_off));
898 __ movptr(Address(rsp, st_off), r13);
899 } else {
900 //
901 // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
902 // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
903 // So we must adjust where to pick up the data to match the interpreter.
904 //
905 // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
906 // are accessed with negative offsets, so the LSW is at the lower address
907
908 // ld_off is MSW so get LSW
909 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
910 next_off : ld_off;
911 __ movq(r13, Address(saved_sp, offset));
912 // st_off is LSW (i.e. reg.first())
913 __ movq(Address(rsp, st_off), r13);
914 }
915 } else if (r_1->is_Register()) { // Register argument
916 Register r = r_1->as_Register();
917 assert(r != rax, "must be different");
918 if (r_2->is_valid()) {
919 //
920 // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
921 // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
922 // So we must adjust where to pick up the data to match the interpreter.
923
924 const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
925 next_off : ld_off;
926
927 // this can be a misaligned move
928 __ movq(r, Address(saved_sp, offset));
929 } else {
930 // sign extend and use a full word?
931 __ movl(r, Address(saved_sp, ld_off));
932 }
933 } else {
934 if (!r_2->is_valid()) {
935 __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
936 } else {
937 __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
938 }
939 }
940 }
941
942 // 6243940 We might end up in handle_wrong_method if
943 // the callee is deoptimized as we race thru here. If that
944 // happens we don't want to take a safepoint because the
945 // caller frame will look interpreted and arguments are now
946 // "compiled" so it is much better to make this transition
947 // invisible to the stack walking code. Unfortunately if
948 // we try and find the callee by normal means a safepoint
949 // is possible. So we stash the desired callee in the thread
950 // and the VM will find it there should this case occur.
951
952 __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
953
954 // put Method* where a c2i would expect it should we end up there
955 // only needed because c2 resolve stubs return Method* as a result in
956 // rax
957 __ mov(rax, rbx);
958 __ jmp(r11);
959 }
960
961 // ---------------------------------------------------------------
962 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
963 int total_args_passed,
964 int comp_args_on_stack,
965 const BasicType *sig_bt,
966 const VMRegPair *regs,
967 AdapterFingerPrint* fingerprint) {
968 address i2c_entry = __ pc();
969
970 gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
971
972 // -------------------------------------------------------------------------
973 // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
974 // to the interpreter. The args start out packed in the compiled layout. They
975 // need to be unpacked into the interpreter layout. This will almost always
976 // require some stack space. We grow the current (compiled) stack, then repack
977 // the args. We finally end in a jump to the generic interpreter entry point.
978 // On exit from the interpreter, the interpreter will restore our SP (lest the
979 // compiled code, which relies solely on SP and not RBP, get sick).
980
981 address c2i_unverified_entry = __ pc();
982 Label skip_fixup;
983 Label ok;
984
985 Register holder = rax;
986 Register receiver = j_rarg0;
987 Register temp = rbx;
988
989 {
990 __ load_klass(temp, receiver, rscratch1);
991 __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
992 __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
993 __ jcc(Assembler::equal, ok);
994 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
995
996 __ bind(ok);
997 // Method might have been compiled since the call site was patched to
998 // interpreted; if that is the case, treat it as a miss so we can get
999 // the call site corrected.
1000 __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
1001 __ jcc(Assembler::equal, skip_fixup);
1002 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1003 }
1004
1005 address c2i_entry = __ pc();
1006
1007 // Class initialization barrier for static methods
1008 address c2i_no_clinit_check_entry = NULL;
1009 if (VM_Version::supports_fast_class_init_checks()) {
1010 Label L_skip_barrier;
1011 Register method = rbx;
1012
1013 { // Bypass the barrier for non-static methods
1014 Register flags = rscratch1;
1015 __ movl(flags, Address(method, Method::access_flags_offset()));
1016 __ testl(flags, JVM_ACC_STATIC);
1017 __ jcc(Assembler::zero, L_skip_barrier); // non-static
1018 }
1019
1020 Register klass = rscratch1;
1021 __ load_method_holder(klass, method);
1022 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1023
1024 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1025
1026 __ bind(L_skip_barrier);
1027 c2i_no_clinit_check_entry = __ pc();
1028 }
1029
1030 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1031 bs->c2i_entry_barrier(masm);
1032
1033 gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1034
1035 return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1036 }
1037
1038 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1039 VMRegPair *regs,
1040 VMRegPair *regs2,
1041 int total_args_passed) {
1042 assert(regs2 == NULL, "not needed on x86");
1043 // We return the amount of VMRegImpl stack slots we need to reserve for all
1044 // the arguments NOT counting out_preserve_stack_slots.
1045
1046 // NOTE: These arrays will have to change when c1 is ported
1047 #ifdef _WIN64
1048 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1049 c_rarg0, c_rarg1, c_rarg2, c_rarg3
1050 };
1051 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1052 c_farg0, c_farg1, c_farg2, c_farg3
1053 };
1054 #else
1055 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1056 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1057 };
1058 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1059 c_farg0, c_farg1, c_farg2, c_farg3,
1060 c_farg4, c_farg5, c_farg6, c_farg7
1061 };
1062 #endif // _WIN64
1063
1064
1065 uint int_args = 0;
1066 uint fp_args = 0;
1067 uint stk_args = 0; // inc by 2 each time
1068
1069 for (int i = 0; i < total_args_passed; i++) {
1070 switch (sig_bt[i]) {
1071 case T_BOOLEAN:
1072 case T_CHAR:
1073 case T_BYTE:
1074 case T_SHORT:
1075 case T_INT:
1076 if (int_args < Argument::n_int_register_parameters_c) {
1077 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1078 #ifdef _WIN64
1079 fp_args++;
1080 // Allocate slots for the callee to stuff register args on the stack.
1081 stk_args += 2;
1082 #endif
1083 } else {
1084 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1085 stk_args += 2;
1086 }
1087 break;
1088 case T_LONG:
1089 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1090 // fall through
1091 case T_OBJECT:
1092 case T_ARRAY:
1093 case T_ADDRESS:
1094 case T_METADATA:
1095 if (int_args < Argument::n_int_register_parameters_c) {
1096 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1097 #ifdef _WIN64
1098 fp_args++;
1099 stk_args += 2;
1100 #endif
1101 } else {
1102 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1103 stk_args += 2;
1104 }
1105 break;
1106 case T_FLOAT:
1107 if (fp_args < Argument::n_float_register_parameters_c) {
1108 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1109 #ifdef _WIN64
1110 int_args++;
1111 // Allocate slots for the callee to stuff register args on the stack.
1112 stk_args += 2;
1113 #endif
1114 } else {
1115 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1116 stk_args += 2;
1117 }
1118 break;
1119 case T_DOUBLE:
1120 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1121 if (fp_args < Argument::n_float_register_parameters_c) {
1122 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1123 #ifdef _WIN64
1124 int_args++;
1125 // Allocate slots for the callee to stuff register args on the stack.
1126 stk_args += 2;
1127 #endif
1128 } else {
1129 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1130 stk_args += 2;
1131 }
1132 break;
1133 case T_VOID: // Halves of longs and doubles
1134 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1135 regs[i].set_bad();
1136 break;
1137 default:
1138 ShouldNotReachHere();
1139 break;
1140 }
1141 }
1142 #ifdef _WIN64
1143 // The Windows ABI requires that we always allocate enough stack space
1144 // for 4 64-bit registers to be stored down.
1145 if (stk_args < 8) {
1146 stk_args = 8;
1147 }
1148 #endif // _WIN64
1149
1150 return stk_args;
1151 }
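// Worked example (illustration only): for a native signature flattened to
// { T_ADDRESS, T_OBJECT, T_INT, T_DOUBLE, T_VOID } (e.g. JNIEnv*, jobject, jint, jdouble),
// the System V path above assigns c_rarg0, c_rarg1, c_rarg2 and c_farg0 and returns 0,
// while the Win64 path assigns c_rarg0, c_rarg1, c_rarg2 and c_farg3 (integer and FP
// arguments share positions there) and returns 8 slots of mandatory register home space.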
1152
1153 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1154 uint num_bits,
1155 uint total_args_passed) {
1156 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1157 "only certain vector sizes are supported for now");
1158
1159 static const XMMRegister VEC_ArgReg[32] = {
1160 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
1161 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1162 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1163 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1164 };
1165
1166 uint stk_args = 0;
1167 uint fp_args = 0;
1168
1169 for (uint i = 0; i < total_args_passed; i++) {
1170 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1171 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1172 regs[i].set_pair(vmreg->next(next_val), vmreg);
1173 }
1174
1175 return stk_args;
1176 }
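// For example, with num_bits == 256 each argument i lands in xmm(i) paired with its next
// 7 VMReg slots (the full YMM width); vector arguments are never passed on the stack
// here, so the function always returns 0.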
1177
1178 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1179 // We always ignore the frame_slots arg and just use the space just below the frame
1180 // pointer, which by this time is free to use
1181 switch (ret_type) {
1182 case T_FLOAT:
1183 __ movflt(Address(rbp, -wordSize), xmm0);
1184 break;
1185 case T_DOUBLE:
1186 __ movdbl(Address(rbp, -wordSize), xmm0);
1187 break;
1188 case T_VOID: break;
1189 default: {
1190 __ movptr(Address(rbp, -wordSize), rax);
1191 }
1192 }
1193 }
1194
1195 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1196 // We always ignore the frame_slots arg and just use the space just below the frame
1197 // pointer, which by this time is free to use
1198 switch (ret_type) {
1199 case T_FLOAT:
1200 __ movflt(xmm0, Address(rbp, -wordSize));
1201 break;
1202 case T_DOUBLE:
1203 __ movdbl(xmm0, Address(rbp, -wordSize));
1204 break;
1205 case T_VOID: break;
1206 default: {
1207 __ movptr(rax, Address(rbp, -wordSize));
1208 }
1209 }
1210 }
1211
1212 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1213 for ( int i = first_arg ; i < arg_count ; i++ ) {
1214 if (args[i].first()->is_Register()) {
1215 __ push(args[i].first()->as_Register());
1216 } else if (args[i].first()->is_XMMRegister()) {
1217 __ subptr(rsp, 2*wordSize);
1218 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1219 }
1220 }
1221 }
1222
1223 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1224 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1225 if (args[i].first()->is_Register()) {
1226 __ pop(args[i].first()->as_Register());
1227 } else if (args[i].first()->is_XMMRegister()) {
1228 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1229 __ addptr(rsp, 2*wordSize);
1230 }
1231 }
1232 }
1233
1234 // Unpack an array argument into a pointer to the body and the length
1235 // if the array is non-null, otherwise pass 0 for both.
1236 static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
1237 Register tmp_reg = rax;
1238 assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
1239 "possible collision");
1240 assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
1241 "possible collision");
1242
1243 __ block_comment("unpack_array_argument {");
1244
1245 // Pass the length, ptr pair
1246 Label is_null, done;
1247 VMRegPair tmp;
1248 tmp.set_ptr(tmp_reg->as_VMReg());
1249 if (reg.first()->is_stack()) {
1250 // Load the arg up from the stack
1251 __ move_ptr(reg, tmp);
1252 reg = tmp;
1253 }
1254 __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
1255 __ jccb(Assembler::equal, is_null);
1256 __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1257 __ move_ptr(tmp, body_arg);
1258 // load the length relative to the body.
1259 __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
1260 arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1261 __ move32_64(tmp, length_arg);
1262 __ jmpb(done);
1263 __ bind(is_null);
1264 // Pass zeros
1265 __ xorptr(tmp_reg, tmp_reg);
1266 __ move_ptr(tmp, body_arg);
1267 __ move32_64(tmp, length_arg);
1268 __ bind(done);
1269
1270 __ block_comment("} unpack_array_argument");
1271 }
1272
1273
1274 // Different signatures may require very different orders for the move
1275 // to avoid clobbering other arguments. There's no simple way to
1276 // order them safely. Compute a safe order for issuing stores and
1277 // break any cycles in those stores. This code is fairly general but
1278 // it's not necessary on the other platforms so we keep it in the
1279 // platform dependent code instead of moving it into a shared file.
1280 // (See bugs 7013347 & 7145024.)
1281 // Note that this code is specific to LP64.
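// Illustrative example: if one argument needs rdi -> rsi while another needs rsi -> rdi,
// the two stores form a cycle; get_store_order() detects it and break_cycle() routes one
// value through tmp_vmreg, yielding the safe order rsi -> tmp, rdi -> rsi, tmp -> rdi
// (register names here are purely for illustration).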
1282 class ComputeMoveOrder: public StackObj {
1283 class MoveOperation: public ResourceObj {
1284 friend class ComputeMoveOrder;
1285 private:
1286 VMRegPair _src;
1287 VMRegPair _dst;
1288 int _src_index;
1289 int _dst_index;
1290 bool _processed;
1291 MoveOperation* _next;
1292 MoveOperation* _prev;
1293
1294 static int get_id(VMRegPair r) {
1295 return r.first()->value();
1296 }
1297
1298 public:
1299 MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1300 _src(src)
1301 , _dst(dst)
1302 , _src_index(src_index)
1303 , _dst_index(dst_index)
1304 , _processed(false)
1305 , _next(NULL)
1306 , _prev(NULL) {
1307 }
1308
1309 VMRegPair src() const { return _src; }
1310 int src_id() const { return get_id(src()); }
1311 int src_index() const { return _src_index; }
1312 VMRegPair dst() const { return _dst; }
1313 void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
1314 int dst_index() const { return _dst_index; }
1315 int dst_id() const { return get_id(dst()); }
1316 MoveOperation* next() const { return _next; }
1317 MoveOperation* prev() const { return _prev; }
1318 void set_processed() { _processed = true; }
1319 bool is_processed() const { return _processed; }
1320
1321 // insert
1322 void break_cycle(VMRegPair temp_register) {
1323 // create a new store following the last store
1324 // to move from the temp_register to the original
1325 MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1326
1327 // break the cycle of links and insert new_store at the end
1328 // break the reverse link.
1329 MoveOperation* p = prev();
1330 assert(p->next() == this, "must be");
1331 _prev = NULL;
1332 p->_next = new_store;
1333 new_store->_prev = p;
1334
1335 // change the original store to save its value in the temp.
1336 set_dst(-1, temp_register);
1337 }
1338
1339 void link(GrowableArray<MoveOperation*>& killer) {
1340 // link this store in front the store that it depends on
1341 MoveOperation* n = killer.at_grow(src_id(), NULL);
1342 if (n != NULL) {
1343 assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1344 _next = n;
1345 n->_prev = this;
1346 }
1347 }
1348 };
1349
1350 private:
1351 GrowableArray<MoveOperation*> edges;
1352
1353 public:
1354 ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1355 const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1356 // Move operations where the dest is the stack can all be
1357 // scheduled first since they can't interfere with the other moves.
1358 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1359 if (in_sig_bt[i] == T_ARRAY) {
1360 c_arg--;
1361 if (out_regs[c_arg].first()->is_stack() &&
1362 out_regs[c_arg + 1].first()->is_stack()) {
1363 arg_order.push(i);
1364 arg_order.push(c_arg);
1365 } else {
1366 if (out_regs[c_arg].first()->is_stack() ||
1367 in_regs[i].first() == out_regs[c_arg].first()) {
1368 add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1369 } else {
1370 add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1371 }
1372 }
1373 } else if (in_sig_bt[i] == T_VOID) {
1374 arg_order.push(i);
1375 arg_order.push(c_arg);
1376 } else {
1377 if (out_regs[c_arg].first()->is_stack() ||
1378 in_regs[i].first() == out_regs[c_arg].first()) {
1379 arg_order.push(i);
1380 arg_order.push(c_arg);
1381 } else {
1382 add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1383 }
1384 }
1385 }
1386 // Break any cycles in the register moves and emit them in the
1387 // proper order.
1388 GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1389 for (int i = 0; i < stores->length(); i++) {
1390 arg_order.push(stores->at(i)->src_index());
1391 arg_order.push(stores->at(i)->dst_index());
1392 }
1393 }
1394
1395 // Collect all the move operations
1396 void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1397 if (src.first() == dst.first()) return;
1398 edges.append(new MoveOperation(src_index, src, dst_index, dst));
1399 }
1400
1401 // Walk the edges breaking cycles between moves. The result list
1402 // can be walked in order to produce the proper set of loads
1403 GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1404 // Record which moves kill which values
1405 GrowableArray<MoveOperation*> killer;
1406 for (int i = 0; i < edges.length(); i++) {
1407 MoveOperation* s = edges.at(i);
1408 assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1409 killer.at_put_grow(s->dst_id(), s, NULL);
1410 }
1411 assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1412 "make sure temp isn't in the registers that are killed");
1413
1414 // create links between loads and stores
1415 for (int i = 0; i < edges.length(); i++) {
1416 edges.at(i)->link(killer);
1417 }
1418
1419 // at this point, all the move operations are chained together
1420 // in a doubly linked list. Processing it backwards finds
1421 // the beginning of the chain, forwards finds the end. If there's
1422 // a cycle it can be broken at any point, so pick an edge and walk
1423 // backward until the list ends or we end where we started.
1424 GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1425 for (int e = 0; e < edges.length(); e++) {
1426 MoveOperation* s = edges.at(e);
1427 if (!s->is_processed()) {
1428 MoveOperation* start = s;
1429 // search for the beginning of the chain or cycle
1430 while (start->prev() != NULL && start->prev() != s) {
1431 start = start->prev();
1432 }
1433 if (start->prev() == s) {
1434 start->break_cycle(temp_register);
1435 }
1436 // walk the chain forward inserting to store list
1437 while (start != NULL) {
1438 stores->append(start);
1439 start->set_processed();
1440 start = start->next();
1441 }
1442 }
1443 }
1444 return stores;
1445 }
1446 };
1447
1448 static void verify_oop_args(MacroAssembler* masm,
1449 const methodHandle& method,
1450 const BasicType* sig_bt,
1451 const VMRegPair* regs) {
1452 Register temp_reg = rbx; // not part of any compiled calling seq
1453 if (VerifyOops) {
1454 for (int i = 0; i < method->size_of_parameters(); i++) {
1455 if (is_reference_type(sig_bt[i])) {
1456 VMReg r = regs[i].first();
1457 assert(r->is_valid(), "bad oop arg");
1458 if (r->is_stack()) {
1459 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1460 __ verify_oop(temp_reg);
1461 } else {
1462 __ verify_oop(r->as_Register());
1463 }
1464 }
1465 }
1466 }
1467 }
1468
1469 static void gen_special_dispatch(MacroAssembler* masm,
1470 const methodHandle& method,
1471 const BasicType* sig_bt,
1472 const VMRegPair* regs) {
1473 verify_oop_args(masm, method, sig_bt, regs);
1474 vmIntrinsics::ID iid = method->intrinsic_id();
1475
1476 // Now write the args into the outgoing interpreter space
1477 bool has_receiver = false;
1478 Register receiver_reg = noreg;
1479 int member_arg_pos = -1;
1480 Register member_reg = noreg;
1481 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1482 if (ref_kind != 0) {
1483 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
1484 member_reg = rbx; // known to be free at this point
1485 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1486 } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1487 has_receiver = true;
1488 } else {
1489 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1490 }
1491
1492 if (member_reg != noreg) {
1493 // Load the member_arg into register, if necessary.
1494 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1495 VMReg r = regs[member_arg_pos].first();
1496 if (r->is_stack()) {
1497 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1498 } else {
1499 // no data motion is needed
1500 member_reg = r->as_Register();
1501 }
1502 }
1503
1504 if (has_receiver) {
1505 // Make sure the receiver is loaded into a register.
1506 assert(method->size_of_parameters() > 0, "oob");
1507 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1508 VMReg r = regs[0].first();
1509 assert(r->is_valid(), "bad receiver arg");
1510 if (r->is_stack()) {
1511 // Porting note: This assumes that compiled calling conventions always
1512 // pass the receiver oop in a register. If this is not true on some
1513 // platform, pick a temp and load the receiver from stack.
1514 fatal("receiver always in a register");
1515 receiver_reg = j_rarg0; // known to be free at this point
1516 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1517 } else {
1518 // no data motion is needed
1519 receiver_reg = r->as_Register();
1520 }
1521 }
1522
1523 // Figure out which address we are really jumping to:
1524 MethodHandles::generate_method_handle_dispatch(masm, iid,
1525 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1526 }
1527
1528 // ---------------------------------------------------------------------------
1529 // Generate a native wrapper for a given method. The method takes arguments
1530 // in the Java compiled code convention, marshals them to the native
1531 // convention (handlizes oops, etc), transitions to native, makes the call,
1532 // returns to java state (possibly blocking), unhandlizes any result and
1533 // returns.
1534 //
1535 // Critical native functions are a shorthand for the use of
1536 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1537 // functions. The wrapper is expected to unpack the arguments before
1538 // passing them to the callee. Critical native functions leave the state _in_Java,
1539 // since they cannot stop for GC.
1540 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1541 // block and the check for pending exceptions, since it's impossible for them
1542 // to be thrown.
1543 //
1544 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1545 const methodHandle& method,
1546 int compile_id,
1547 BasicType* in_sig_bt,
1548 VMRegPair* in_regs,
1549 BasicType ret_type,
1550 address critical_entry) {
1551 if (method->is_method_handle_intrinsic()) {
1552 vmIntrinsics::ID iid = method->intrinsic_id();
1553 intptr_t start = (intptr_t)__ pc();
1554 int vep_offset = ((intptr_t)__ pc()) - start;
1555 gen_special_dispatch(masm,
1556 method,
1557 in_sig_bt,
1558 in_regs);
1559 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1560 __ flush();
1561 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1562 return nmethod::new_native_nmethod(method,
1563 compile_id,
1564 masm->code(),
1565 vep_offset,
1566 frame_complete,
1567 stack_slots / VMRegImpl::slots_per_word,
1568 in_ByteSize(-1),
1569 in_ByteSize(-1),
1570 (OopMapSet*)NULL);
1571 }
1572 bool is_critical_native = true;
1573 address native_func = critical_entry;
1574 if (native_func == NULL) {
1575 native_func = method->native_function();
1576 is_critical_native = false;
1577 }
1578 assert(native_func != NULL, "must have function");
1579
1580 // An OopMap for lock (and class if static)
1581 OopMapSet *oop_maps = new OopMapSet();
1582 intptr_t start = (intptr_t)__ pc();
1583
1584 // We have received a description of where all the java args are located
1585 // on entry to the wrapper. We need to convert these args to where
1586 // the jni function will expect them. To figure out where they go
1587 // we convert the java signature to a C signature by inserting
1588 // the hidden arguments as arg[0] and possibly arg[1] (static method)
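// For illustration only: for a non-static method arg[0] becomes the JNIEnv*
// (T_ADDRESS) followed by the unchanged Java arguments (the receiver is
// already Java arg 0); for a static method an extra T_OBJECT slot for the
// class mirror is inserted as arg[1] ahead of the Java arguments.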
1589
1590 const int total_in_args = method->size_of_parameters();
1591 int total_c_args = total_in_args;
1592 if (!is_critical_native) {
1593 total_c_args += 1;
1594 if (method->is_static()) {
1595 total_c_args++;
1596 }
1597 } else {
1598 for (int i = 0; i < total_in_args; i++) {
1599 if (in_sig_bt[i] == T_ARRAY) {
1600 total_c_args++;
1601 }
1602 }
1603 }
1604
1605 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1606 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1607 BasicType* in_elem_bt = NULL;
1608
1609 int argc = 0;
1610 if (!is_critical_native) {
1611 out_sig_bt[argc++] = T_ADDRESS;
1612 if (method->is_static()) {
1613 out_sig_bt[argc++] = T_OBJECT;
1614 }
1615
1616 for (int i = 0; i < total_in_args ; i++ ) {
1617 out_sig_bt[argc++] = in_sig_bt[i];
1618 }
1619 } else {
1620 in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1621 SignatureStream ss(method->signature());
1622 for (int i = 0; i < total_in_args ; i++ ) {
1623 if (in_sig_bt[i] == T_ARRAY) {
1624 // Arrays are passed as int, elem* pair
1625 out_sig_bt[argc++] = T_INT;
1626 out_sig_bt[argc++] = T_ADDRESS;
1627 ss.skip_array_prefix(1); // skip one '['
1628 assert(ss.is_primitive(), "primitive type expected");
1629 in_elem_bt[i] = ss.type();
1630 } else {
1631 out_sig_bt[argc++] = in_sig_bt[i];
1632 in_elem_bt[i] = T_VOID;
1633 }
1634 if (in_sig_bt[i] != T_VOID) {
1635 assert(in_sig_bt[i] == ss.type() ||
1636 in_sig_bt[i] == T_ARRAY, "must match");
1637 ss.next();
1638 }
1639 }
1640 }
1641
1642 // Now figure out where the args must be stored and how much stack space
1643 // they require.
1644 int out_arg_slots;
1645 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1646
1647 // Compute framesize for the wrapper. We need to handlize all oops in
1648 // incoming registers
1649
1650 // Calculate the total number of stack slots we will need.
1651
1652 // First count the abi requirement plus all of the outgoing args
1653 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1654
1655 // Now the space for the inbound oop handle area
1656 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
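// For illustration (assuming 2 stack slots per 64-bit word): the regular JNI
// case reserves 6 * 2 = 12 slots here, i.e. one 8-byte handle slot for each
// of the 6 integer argument registers that might carry an oop.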
1657 if (is_critical_native) {
1658 // Critical natives may have to call out so they need a save area
1659 // for register arguments.
1660 int double_slots = 0;
1661 int single_slots = 0;
1662 for ( int i = 0; i < total_in_args; i++) {
1663 if (in_regs[i].first()->is_Register()) {
1664 const Register reg = in_regs[i].first()->as_Register();
1665 switch (in_sig_bt[i]) {
1666 case T_BOOLEAN:
1667 case T_BYTE:
1668 case T_SHORT:
1669 case T_CHAR:
1670 case T_INT: single_slots++; break;
1671 case T_ARRAY: // specific to LP64 (7145024)
1672 case T_LONG: double_slots++; break;
1673 default: ShouldNotReachHere();
1674 }
1675 } else if (in_regs[i].first()->is_XMMRegister()) {
1676 switch (in_sig_bt[i]) {
1677 case T_FLOAT: single_slots++; break;
1678 case T_DOUBLE: double_slots++; break;
1679 default: ShouldNotReachHere();
1680 }
1681 } else if (in_regs[i].first()->is_FloatRegister()) {
1682 ShouldNotReachHere();
1683 }
1684 }
1685 total_save_slots = double_slots * 2 + single_slots;
1686 // align the save area
1687 if (double_slots != 0) {
1688 stack_slots = align_up(stack_slots, 2);
1689 }
1690 }
1691
1692 int oop_handle_offset = stack_slots;
1693 stack_slots += total_save_slots;
1694
1695 // Now any space we need for handlizing a klass if static method
1696
1697 int klass_slot_offset = 0;
1698 int klass_offset = -1;
1699 int lock_slot_offset = 0;
1700 bool is_static = false;
1701
1702 if (method->is_static()) {
1703 klass_slot_offset = stack_slots;
1704 stack_slots += VMRegImpl::slots_per_word;
1705 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1706 is_static = true;
1707 }
1708
1709 // Plus a lock if needed
1710
1711 if (method->is_synchronized()) {
1712 lock_slot_offset = stack_slots;
1713 stack_slots += VMRegImpl::slots_per_word;
1714 }
1715
1716 // Now a place (+2) to save return values or temp during shuffling
1717 // + 4 for return address (which we own) and saved rbp
1718 stack_slots += 6;
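// For illustration (assuming 4-byte stack slots, 2 per 64-bit word): the 6
// slots added above are 2 slots (one word) for the return-value/temp spill
// area plus 4 slots (two words) for the return address and the saved rbp.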
1719
1720 // OK, the space we have allocated will look like:
1721 //
1722 //
1723 // FP-> | |
1724 // |---------------------|
1725 // | 2 slots for moves |
1726 // |---------------------|
1727 // | lock box (if sync) |
1728 // |---------------------| <- lock_slot_offset
1729 // | klass (if static) |
1730 // |---------------------| <- klass_slot_offset
1731 // | oopHandle area |
1732 // |---------------------| <- oop_handle_offset (6 java arg registers)
1733 // | outbound memory |
1734 // | based arguments |
1735 // | |
1736 // |---------------------|
1737 // | |
1738 // SP-> | out_preserved_slots |
1739 //
1740 //
1741
1742
1743 // Now compute the actual number of stack words we need, rounding up to keep
1744 // the stack properly aligned.
1745 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1746
1747 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
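// For illustration (assuming StackAlignmentInBytes == 16, so
// StackAlignmentInSlots == 4): a raw count of, say, 37 slots rounds up to 40
// slots, giving a 160-byte frame.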
1748
1749 // First thing: make an ic check to see if we should even be here
1750
1751 // We are free to use all registers as temps without saving them and
1752 // restoring them except rbp. rbp is the only callee save register
1753 // as far as the interpreter and the compiler(s) are concerned.
1754
1755
1756 const Register ic_reg = rax;
1757 const Register receiver = j_rarg0;
1758
1759 Label hit;
1760 Label exception_pending;
1761
1762 assert_different_registers(ic_reg, receiver, rscratch1);
1763 __ verify_oop(receiver);
1764 __ load_klass(rscratch1, receiver, rscratch2);
1765 __ cmpq(ic_reg, rscratch1);
1766 __ jcc(Assembler::equal, hit);
1767
1768 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1769
1770 // Verified entry point must be aligned
1771 __ align(8);
1772
1773 __ bind(hit);
1774
1775 int vep_offset = ((intptr_t)__ pc()) - start;
1776
1777 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1778 Label L_skip_barrier;
1779 Register klass = r10;
1780 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1781 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1782
1783 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1784
1785 __ bind(L_skip_barrier);
1786 }
1787
1788 #ifdef COMPILER1
1789 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1790 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1791 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1792 }
1793 #endif // COMPILER1
1794
1795 // The instruction at the verified entry point must be 5 bytes or longer
1796 // because it can be patched on the fly by make_not_entrant. The stack bang
1797 // instruction fits that requirement.
1798
1799 // Generate stack overflow check
1800 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1801
1802 // Generate a new frame for the wrapper.
1803 __ enter();
1804 // -2 because return address is already present and so is saved rbp
1805 __ subptr(rsp, stack_size - 2*wordSize);
1806
1807 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1808 bs->nmethod_entry_barrier(masm);
1809
1810 // Frame is now completed as far as size and linkage.
1811 int frame_complete = ((intptr_t)__ pc()) - start;
1812
1813 if (UseRTMLocking) {
1814 // Abort RTM transaction before calling JNI
1815 // because critical section will be large and will be
1816 // aborted anyway. Also nmethod could be deoptimized.
1817 __ xabort(0);
1818 }
1819
1820 #ifdef ASSERT
1821 {
1822 Label L;
1823 __ mov(rax, rsp);
1824 __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1825 __ cmpptr(rax, rsp);
1826 __ jcc(Assembler::equal, L);
1827 __ stop("improperly aligned stack");
1828 __ bind(L);
1829 }
1830 #endif /* ASSERT */
1831
1832
1833 // We use r14 as the oop handle for the receiver/klass
1834 // It is callee save so it survives the call to native
1835
1836 const Register oop_handle_reg = r14;
1837
1838 //
1839 // We immediately shuffle the arguments so that for any vm call we have to
1840 // make from here on out (sync slow path, jvmti, etc.) we will have
1841 // captured the oops from our caller and have a valid oopMap for
1842 // them.
1843
1844 // -----------------
1845 // The Grand Shuffle
1846
1847 // The Java calling convention is either equal (linux) or denser (win64) than the
1848 // c calling convention. However, because of the jni_env argument the c calling
1849 // convention always has at least one more argument (two more for static) than Java.
1850 // Therefore if we move the args from java -> c backwards then we will never have
1851 // a register->register conflict and we don't have to build a dependency graph
1852 // and figure out how to break any cycles.
1853 //
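// For illustration only: Java arg i lands at C arg i + 1 (i + 2 for a static
// method), so every destination register or slot sits "ahead" of its source
// in the outgoing sequence. Copying the highest-numbered argument first
// therefore never clobbers a source that a later (lower-numbered) move still
// has to read.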
1854
1855 // Record esp-based slot for receiver on stack for non-static methods
1856 int receiver_offset = -1;
1857
1858 // This is a trick. We double the stack slots so we can claim
1859 // the oops in the caller's frame. Since we are sure to have
1860 // more args than the caller, doubling is enough to make
1861 // sure we can capture all the incoming oop args from the
1862 // caller.
1863 //
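// For illustration only: incoming stack oops still live in the caller's
// frame, above this wrapper's stack_slots, so their recorded offsets can
// exceed stack_slots; sizing the map at stack_slots * 2 leaves room for them.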
1864 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1865
1866 // Mark location of rbp (someday)
1867 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1868
1869 // Use eax, ebx as temporaries during any memory-memory moves we have to do
1870 // All inbound args are referenced based on rbp and all outbound args via rsp.
1871
1872
1873 #ifdef ASSERT
1874 bool reg_destroyed[RegisterImpl::number_of_registers];
1875 bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1876 for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1877 reg_destroyed[r] = false;
1878 }
1879 for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1880 freg_destroyed[f] = false;
1881 }
1882
1883 #endif /* ASSERT */
1884
1885 // This may iterate in two different directions depending on the
1886 // kind of native it is. The reason is that for regular JNI natives
1887 // the incoming and outgoing registers are offset upwards and for
1888 // critical natives they are offset down.
1889 GrowableArray<int> arg_order(2 * total_in_args);
1890
1891 VMRegPair tmp_vmreg;
1892 tmp_vmreg.set2(rbx->as_VMReg());
1893
1894 if (!is_critical_native) {
1895 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1896 arg_order.push(i);
1897 arg_order.push(c_arg);
1898 }
1899 } else {
1900 // Compute a valid move order, using tmp_vmreg to break any cycles
1901 ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
1902 }
1903
1904 int temploc = -1;
1905 for (int ai = 0; ai < arg_order.length(); ai += 2) {
1906 int i = arg_order.at(ai);
1907 int c_arg = arg_order.at(ai + 1);
1908 __ block_comment(err_msg("move %d -> %d", i, c_arg));
1909 if (c_arg == -1) {
1910 assert(is_critical_native, "should only be required for critical natives");
1911 // This arg needs to be moved to a temporary
1912 __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
1913 in_regs[i] = tmp_vmreg;
1914 temploc = i;
1915 continue;
1916 } else if (i == -1) {
1917 assert(is_critical_native, "should only be required for critical natives");
1918 // Read from the temporary location
1919 assert(temploc != -1, "must be valid");
1920 i = temploc;
1921 temploc = -1;
1922 }
1923 #ifdef ASSERT
1924 if (in_regs[i].first()->is_Register()) {
1925 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1926 } else if (in_regs[i].first()->is_XMMRegister()) {
1927 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1928 }
1929 if (out_regs[c_arg].first()->is_Register()) {
1930 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1931 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1932 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1933 }
1934 #endif /* ASSERT */
1935 switch (in_sig_bt[i]) {
1936 case T_ARRAY:
1937 if (is_critical_native) {
1938 unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
1939 c_arg++;
1940 #ifdef ASSERT
1941 if (out_regs[c_arg].first()->is_Register()) {
1942 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1943 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1944 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1945 }
1946 #endif
1947 break;
1948 }
1949 case T_OBJECT:
1950 assert(!is_critical_native, "no oop arguments");
1951 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1952 ((i == 0) && (!is_static)),
1953 &receiver_offset);
1954 break;
1955 case T_VOID:
1956 break;
1957
1958 case T_FLOAT:
1959 __ float_move(in_regs[i], out_regs[c_arg]);
1960 break;
1961
1962 case T_DOUBLE:
1963 assert( i + 1 < total_in_args &&
1964 in_sig_bt[i + 1] == T_VOID &&
1965 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1966 __ double_move(in_regs[i], out_regs[c_arg]);
1967 break;
1968
1969 case T_LONG :
1970 __ long_move(in_regs[i], out_regs[c_arg]);
1971 break;
1972
1973 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1974
1975 default:
1976 __ move32_64(in_regs[i], out_regs[c_arg]);
1977 }
1978 }
1979
1980 int c_arg;
1981
1982 // Pre-load a static method's oop into r14. Used both by locking code and
1983 // the normal JNI call code.
1984 if (!is_critical_native) {
1985 // point c_arg at the first arg that is already loaded in case we
1986 // need to spill before we call out
1987 c_arg = total_c_args - total_in_args;
1988
1989 if (method->is_static()) {
1990
1991 // load oop into a register
1992 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1993
1994 // Now handlize the static class mirror it's known not-null.
1995 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1996 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1997
1998 // Now get the handle
1999 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2000 // store the klass handle as second argument
2001 __ movptr(c_rarg1, oop_handle_reg);
2002 // and protect the arg if we must spill
2003 c_arg--;
2004 }
2005 } else {
2006 // For JNI critical methods we need to save all registers in save_args.
2007 c_arg = 0;
2008 }
2009
2010 // Change state to native (we save the return address in the thread, since it might not
2011 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2012 // points into the right code segment. It does not have to be the correct return pc.
2013 // We use the same pc/oopMap repeatedly when we call out
2014
2015 intptr_t the_pc = (intptr_t) __ pc();
2016 oop_maps->add_gc_map(the_pc - start, map);
2017
2018 __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2019
2020
2021 // We have all of the arguments set up at this point. We must not touch any
2022 // argument registers at this point (what if we save/restore them when there are no oops?).
2023
2024 {
2025 SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2026 // protect the args we've loaded
2027 save_args(masm, total_c_args, c_arg, out_regs);
2028 __ mov_metadata(c_rarg1, method());
2029 __ call_VM_leaf(
2030 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2031 r15_thread, c_rarg1);
2032 restore_args(masm, total_c_args, c_arg, out_regs);
2033 }
2034
2035 // RedefineClasses() tracing support for obsolete method entry
2036 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2037 // protect the args we've loaded
2038 save_args(masm, total_c_args, c_arg, out_regs);
2039 __ mov_metadata(c_rarg1, method());
2040 __ call_VM_leaf(
2041 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2042 r15_thread, c_rarg1);
2043 restore_args(masm, total_c_args, c_arg, out_regs);
2044 }
2045
2046 // Lock a synchronized method
2047
2048 // Register definitions used by locking and unlocking
2049
2050 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2051 const Register obj_reg = rbx; // Will contain the oop
2052 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2053 const Register old_hdr = r13; // value of old header at unlock time
2054
2055 Label slow_path_lock;
2056 Label lock_done;
2057
2058 if (method->is_synchronized()) {
2059 assert(!is_critical_native, "unhandled");
2060
2061
2062 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2063
2064 // Get the handle (the 2nd argument)
2065 __ mov(oop_handle_reg, c_rarg1);
2066
2067 // Get address of the box
2068
2069 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2070
2071 // Load the oop from the handle
2072 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2073
2074 if (LockingMode == LM_MONITOR) {
2075 __ jmp(slow_path_lock);
2076 } else if (LockingMode == LM_LEGACY) {
2077 if (UseBiasedLocking) {
2078 __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, rscratch2, false, lock_done, &slow_path_lock);
2079 }
2080
2081 // Load immediate 1 into swap_reg %rax
2082 __ movl(swap_reg, 1);
2083
2084 // Load (object->mark() | 1) into swap_reg %rax
2085 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2086
2087 // Save (object->mark() | 1) into BasicLock's displaced header
2088 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2089
2090 // src -> dest iff dest == rax else rax <- dest
2091 __ lock();
2092 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2093 __ jcc(Assembler::equal, lock_done);
2094
2095 // Hmm should this move to the slow path code area???
2096
2097 // Test if the oopMark is an obvious stack pointer, i.e.,
2098 // 1) (mark & 3) == 0, and
2099 // 2) rsp <= mark < rsp + os::pagesize()
2100 // These 3 tests can be done by evaluating the following
2101 // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2102 // assuming both stack pointer and pagesize have their
2103 // least significant 2 bits clear.
2104 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
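// For illustration only (assuming a 4K page, so 3 - os::vm_page_size() is
// 0x...fffff003): if the displaced mark is on our own stack, e.g.
// rsp = 0x...1000 and mark = 0x...1040, then (mark - rsp) & 0x...f003 == 0,
// which is stored as the displaced header below, marking a recursive stack
// lock; a mark below rsp, at or beyond rsp + 4K, or with its low two bits
// set gives a non-zero result and we take the slow path.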
2105
2106 __ subptr(swap_reg, rsp);
2107 __ andptr(swap_reg, 3 - os::vm_page_size());
2108
2109 // Save the test result, for recursive case, the result is zero
2110 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2111 __ jcc(Assembler::notEqual, slow_path_lock);
2112 } else {
2113 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2114 __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2115 }
2116
2117 // Slow path will re-enter here
2118
2119 __ bind(lock_done);
2120 }
2121
2122 // Finally just about ready to make the JNI call
2123
2124 // get JNIEnv* which is first argument to native
2125 if (!is_critical_native) {
2126 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2127
2128 // Now set thread in native
2129 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2130 }
2131
2132 __ call(RuntimeAddress(native_func));
2133
2134 // Verify or restore cpu control state after JNI call
2135 __ restore_cpu_control_state_after_jni();
2136
2137 // Unpack native results.
2138 switch (ret_type) {
2139 case T_BOOLEAN: __ c2bool(rax); break;
2140 case T_CHAR : __ movzwl(rax, rax); break;
2141 case T_BYTE : __ sign_extend_byte (rax); break;
2142 case T_SHORT : __ sign_extend_short(rax); break;
2143 case T_INT : /* nothing to do */ break;
2144 case T_DOUBLE :
2145 case T_FLOAT :
2146 // Result is in xmm0; we'll save it as needed
2147 break;
2148 case T_ARRAY: // Really a handle
2149 case T_OBJECT: // Really a handle
2150 break; // can't de-handlize until after safepoint check
2151 case T_VOID: break;
2152 case T_LONG: break;
2153 default : ShouldNotReachHere();
2154 }
2155
2156 Label after_transition;
2157
2158 // If this is a critical native, check for a safepoint or suspend request after the call.
2159 // If a safepoint is needed, transition to native, then to native_trans to handle
2160 // safepoints like the native methods that are not critical natives.
2161 if (is_critical_native) {
2162 Label needs_safepoint;
2163 __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */);
2164 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2165 __ jcc(Assembler::equal, after_transition);
2166 __ bind(needs_safepoint);
2167 }
2168
2169 // Switch thread to "native transition" state before reading the synchronization state.
2170 // This additional state is necessary because reading and testing the synchronization
2171 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2172 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2173 // VM thread changes sync state to synchronizing and suspends threads for GC.
2174 // Thread A is resumed to finish this native method, but doesn't block here since it
2175 // didn't see any synchronization in progress, and escapes.
2176 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2177
2178 // Force this write out before the read below
2179 __ membar(Assembler::Membar_mask_bits(
2180 Assembler::LoadLoad | Assembler::LoadStore |
2181 Assembler::StoreLoad | Assembler::StoreStore));
2182
2183 // check for safepoint operation in progress and/or pending suspend requests
2184 {
2185 Label Continue;
2186 Label slow_path;
2187
2188 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2189
2190 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2191 __ jcc(Assembler::equal, Continue);
2192 __ bind(slow_path);
2193
2194 // Don't use call_VM as it will see a possible pending exception and forward it
2195 // and never return here, preventing us from clearing _last_native_pc down below.
2196 // We can't use call_VM_leaf either, as it will check to see if rsi & rdi are
2197 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2198 // by hand.
2199 //
2200 __ vzeroupper();
2201 save_native_result(masm, ret_type, stack_slots);
2202 __ mov(c_rarg0, r15_thread);
2203 __ mov(r12, rsp); // remember sp
2204 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2205 __ andptr(rsp, -16); // align stack as required by ABI
2206 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2207 __ mov(rsp, r12); // restore sp
2208 __ reinit_heapbase();
2209 // Restore any method result value
2210 restore_native_result(masm, ret_type, stack_slots);
2211 __ bind(Continue);
2212 }
2213
2214 // change thread state
2215 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2216 __ bind(after_transition);
2217
2218 Label reguard;
2219 Label reguard_done;
2220 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2221 __ jcc(Assembler::equal, reguard);
2222 __ bind(reguard_done);
2223
2224 // native result if any is live
2225
2226 // Unlock
2227 Label unlock_done;
2228 Label slow_path_unlock;
2229 if (method->is_synchronized()) {
2230
2231 // Get locked oop from the handle we passed to jni
2232 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2233
2234 Label done;
2235
2236 if (UseBiasedLocking) {
2237 __ biased_locking_exit(obj_reg, old_hdr, done);
2238 }
2239
2240 if (LockingMode == LM_LEGACY) {
2241 // Simple recursive lock?
2242
2243 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2244 __ jcc(Assembler::equal, done);
2245 }
2246
2247 // Must save rax if it is live now because cmpxchg must use it
2248 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2249 save_native_result(masm, ret_type, stack_slots);
2250 }
2251
2252 if (LockingMode == LM_MONITOR) {
2253 __ jmp(slow_path_unlock);
2254 } else if (LockingMode == LM_LEGACY) {
2255 // get address of the stack lock
2256 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2257 // get old displaced header
2258 __ movptr(old_hdr, Address(rax, 0));
2259
2260 // Atomic swap old header if oop still contains the stack lock
2261 __ lock();
2262 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2263 __ jcc(Assembler::notEqual, slow_path_unlock);
2264 } else {
2265 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2266 __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2267 }
2268
2269 // slow path re-enters here
2270 __ bind(unlock_done);
2271 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2272 restore_native_result(masm, ret_type, stack_slots);
2273 }
2274
2275 __ bind(done);
2276
2277 }
2278 {
2279 SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2280 save_native_result(masm, ret_type, stack_slots);
2281 __ mov_metadata(c_rarg1, method());
2282 __ call_VM_leaf(
2283 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2284 r15_thread, c_rarg1);
2285 restore_native_result(masm, ret_type, stack_slots);
2286 }
2287
2288 __ reset_last_Java_frame(false);
2289
2290 // Unbox oop result, e.g. JNIHandles::resolve value.
2291 if (is_reference_type(ret_type)) {
2292 __ resolve_jobject(rax /* value */,
2293 r15_thread /* thread */,
2294 rcx /* tmp */);
2295 }
2296
2297 if (CheckJNICalls) {
2298 // clear_pending_jni_exception_check
2299 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2300 }
2301
2302 if (!is_critical_native) {
2303 // reset handle block
2304 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2305 __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2306 }
2307
2308 // pop our frame
2309
2310 __ leave();
2311
2312 if (!is_critical_native) {
2313 // Any exception pending?
2314 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2315 __ jcc(Assembler::notEqual, exception_pending);
2316 }
2317
2318 // Return
2319
2320 __ ret(0);
2321
2322 // Unexpected paths are out of line and go here
2323
2324 if (!is_critical_native) {
2325 // an exception is pending; handle it out of line
2326 __ bind(exception_pending);
2327
2328 // and forward the exception
2329 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2330 }
2331
2332 // Slow path locking & unlocking
2333 if (method->is_synchronized()) {
2334
2335 // BEGIN Slow path lock
2336 __ bind(slow_path_lock);
2337
2338 // has last_Java_frame setup. No exceptions, so do a vanilla call, not call_VM
2339 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2340
2341 // protect the args we've loaded
2342 save_args(masm, total_c_args, c_arg, out_regs);
2343
2344 __ mov(c_rarg0, obj_reg);
2345 __ mov(c_rarg1, lock_reg);
2346 __ mov(c_rarg2, r15_thread);
2347
2348 // Not a leaf but we have last_Java_frame setup as we want
2349 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2350 restore_args(masm, total_c_args, c_arg, out_regs);
2351
2352 #ifdef ASSERT
2353 { Label L;
2354 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2355 __ jcc(Assembler::equal, L);
2356 __ stop("no pending exception allowed on exit from monitorenter");
2357 __ bind(L);
2358 }
2359 #endif
2360 __ jmp(lock_done);
2361
2362 // END Slow path lock
2363
2364 // BEGIN Slow path unlock
2365 __ bind(slow_path_unlock);
2366
2367 // If we haven't already saved the native result we must save it now as xmm registers
2368 // are still exposed.
2369 __ vzeroupper();
2370 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2371 save_native_result(masm, ret_type, stack_slots);
2372 }
2373
2374 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2375
2376 __ mov(c_rarg0, obj_reg);
2377 __ mov(c_rarg2, r15_thread);
2378 __ mov(r12, rsp); // remember sp
2379 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2380 __ andptr(rsp, -16); // align stack as required by ABI
2381
2382 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2383 // NOTE that obj_reg == rbx currently
2384 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2385 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2386
2387 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2388 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2389 __ mov(rsp, r12); // restore sp
2390 __ reinit_heapbase();
2391 #ifdef ASSERT
2392 {
2393 Label L;
2394 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2395 __ jcc(Assembler::equal, L);
2396 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2397 __ bind(L);
2398 }
2399 #endif /* ASSERT */
2400
2401 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2402
2403 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2404 restore_native_result(masm, ret_type, stack_slots);
2405 }
2406 __ jmp(unlock_done);
2407
2408 // END Slow path unlock
2409
2410 } // synchronized
2411
2412 // SLOW PATH Reguard the stack if needed
2413
2414 __ bind(reguard);
2415 __ vzeroupper();
2416 save_native_result(masm, ret_type, stack_slots);
2417 __ mov(r12, rsp); // remember sp
2418 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2419 __ andptr(rsp, -16); // align stack as required by ABI
2420 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2421 __ mov(rsp, r12); // restore sp
2422 __ reinit_heapbase();
2423 restore_native_result(masm, ret_type, stack_slots);
2424 // and continue
2425 __ jmp(reguard_done);
2426
2427
2428
2429 __ flush();
2430
2431 nmethod *nm = nmethod::new_native_nmethod(method,
2432 compile_id,
2433 masm->code(),
2434 vep_offset,
2435 frame_complete,
2436 stack_slots / VMRegImpl::slots_per_word,
2437 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2438 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2439 oop_maps);
2440
2441 return nm;
2442 }
2443
2444 // this function returns the adjustment size (in number of words) to a c2i adapter
2445 // activation for use during deoptimization
2446 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2447 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2448 }
2449
2450
2451 uint SharedRuntime::out_preserve_stack_slots() {
2452 return 0;
2453 }
2454
2455
2456 // Number of stack slots between incoming argument block and the start of
2457 // a new frame. The PROLOG must add this many slots to the stack. The
2458 // EPILOG must remove this many slots. amd64 needs two slots for
2459 // return address.
2460 uint SharedRuntime::in_preserve_stack_slots() {
2461 return 4 + 2 * VerifyStackAtCalls;
2462 }
2463
2464 //------------------------------generate_deopt_blob----------------------------
2465 void SharedRuntime::generate_deopt_blob() {
2466 // Allocate space for the code
2467 ResourceMark rm;
2468 // Setup code generation tools
2469 int pad = 0;
2470 if (UseAVX > 2) {
2471 pad += 1024;
2472 }
2473 #if INCLUDE_JVMCI
2474 if (EnableJVMCI) {
2475 pad += 512; // Increase the buffer size when compiling for JVMCI
2476 }
2477 #endif
2478 CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2479 MacroAssembler* masm = new MacroAssembler(&buffer);
2480 int frame_size_in_words;
2481 OopMap* map = NULL;
2482 OopMapSet *oop_maps = new OopMapSet();
2483
2484 // -------------
2485 // This code enters when returning to a de-optimized nmethod. A return
2486 // address has been pushed on the stack, and return values are in
2487 // registers.
2488 // If we are doing a normal deopt then we were called from the patched
2489 // nmethod from the point we returned to the nmethod. So the return
2490 // address on the stack is wrong by NativeCall::instruction_size
2491 // We will adjust the value so it looks like we have the original return
2492 // address on the stack (like when we eagerly deoptimized).
2493 // In the case of an exception pending when deoptimizing, we enter
2494 // with a return address on the stack that points after the call we patched
2495 // into the exception handler. We have the following register state from,
2496 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2497 // rax: exception oop
2498 // rbx: exception handler
2499 // rdx: throwing pc
2500 // So in this case we simply jam rdx into the useless return address and
2501 // the stack looks just like we want.
2502 //
2503 // At this point we need to de-opt. We save the argument return
2504 // registers. We call the first C routine, fetch_unroll_info(). This
2505 // routine captures the return values and returns a structure which
2506 // describes the current frame size and the sizes of all replacement frames.
2507 // The current frame is compiled code and may contain many inlined
2508 // functions, each with their own JVM state. We pop the current frame, then
2509 // push all the new frames. Then we call the C routine unpack_frames() to
2510 // populate these frames. Finally unpack_frames() returns us the new target
2511 // address. Notice that callee-save registers are BLOWN here; they have
2512 // already been captured in the vframeArray at the time the return PC was
2513 // patched.
2514 address start = __ pc();
2515 Label cont;
2516
2517 // Prolog for non exception case!
2518
2519 // Save everything in sight.
2520 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2521
2522 // Normal deoptimization. Save exec mode for unpack_frames.
2523 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2524 __ jmp(cont);
2525
2526 int reexecute_offset = __ pc() - start;
2527 #if INCLUDE_JVMCI && !defined(COMPILER1)
2528 if (EnableJVMCI && UseJVMCICompiler) {
2529 // JVMCI does not use this kind of deoptimization
2530 __ should_not_reach_here();
2531 }
2532 #endif
2533
2534 // Reexecute case
2535 // return address is the pc that describes what bci to re-execute at
2536
2537 // No need to update map as each call to save_live_registers will produce identical oopmap
2538 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2539
2540 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2541 __ jmp(cont);
2542
2543 #if INCLUDE_JVMCI
2544 Label after_fetch_unroll_info_call;
2545 int implicit_exception_uncommon_trap_offset = 0;
2546 int uncommon_trap_offset = 0;
2547
2548 if (EnableJVMCI) {
2549 implicit_exception_uncommon_trap_offset = __ pc() - start;
2550
2551 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2552 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2553
2554 uncommon_trap_offset = __ pc() - start;
2555
2556 // Save everything in sight.
2557 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2558 // fetch_unroll_info needs to call last_java_frame()
2559 __ set_last_Java_frame(noreg, noreg, NULL);
2560
2561 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2562 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2563
2564 __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2565 __ mov(c_rarg0, r15_thread);
2566 __ movl(c_rarg2, r14); // exec mode
2567 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2568 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2569
2570 __ reset_last_Java_frame(false);
2571
2572 __ jmp(after_fetch_unroll_info_call);
2573 } // EnableJVMCI
2574 #endif // INCLUDE_JVMCI
2575
2576 int exception_offset = __ pc() - start;
2577
2578 // Prolog for exception case
2579
2580 // all registers are dead at this entry point, except for rax and
2581 // rdx, which contain the exception oop and exception pc
2582 // respectively. Set them in TLS and fall thru to the
2583 // unpack_with_exception_in_tls entry point.
2584
2585 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2586 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2587
2588 int exception_in_tls_offset = __ pc() - start;
2589
2590 // new implementation because exception oop is now passed in JavaThread
2591
2592 // Prolog for exception case
2593 // All registers must be preserved because they might be used by LinearScan
2594 // Exception oop and throwing PC are passed in JavaThread
2595 // tos: stack at point of call to method that threw the exception (i.e. only
2596 // args are on the stack, no return address)
2597
2598 // make room on stack for the return address
2599 // It will be patched later with the throwing pc. The correct value is not
2600 // available now because loading it from memory would destroy registers.
2601 __ push(0);
2602
2603 // Save everything in sight.
2604 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2605
2606 // Now it is safe to overwrite any register
2607
2608 // Deopt during an exception. Save exec mode for unpack_frames.
2609 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2610
2611 // load throwing pc from JavaThread and patch it as the return address
2612 // of the current frame. Then clear the field in JavaThread
2613
2614 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2615 __ movptr(Address(rbp, wordSize), rdx);
2616 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2617
2618 #ifdef ASSERT
2619 // verify that there is really an exception oop in JavaThread
2620 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2621 __ verify_oop(rax);
2622
2623 // verify that there is no pending exception
2624 Label no_pending_exception;
2625 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2626 __ testptr(rax, rax);
2627 __ jcc(Assembler::zero, no_pending_exception);
2628 __ stop("must not have pending exception here");
2629 __ bind(no_pending_exception);
2630 #endif
2631
2632 __ bind(cont);
2633
2634 // Call C code. Need thread and this frame, but NOT official VM entry
2635 // crud. We cannot block on this call, no GC can happen.
2636 //
2637 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2638
2639 // fetch_unroll_info needs to call last_java_frame().
2640
2641 __ set_last_Java_frame(noreg, noreg, NULL);
2642 #ifdef ASSERT
2643 { Label L;
2644 __ cmpptr(Address(r15_thread,
2645 JavaThread::last_Java_fp_offset()),
2646 (int32_t)0);
2647 __ jcc(Assembler::equal, L);
2648 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2649 __ bind(L);
2650 }
2651 #endif // ASSERT
2652 __ mov(c_rarg0, r15_thread);
2653 __ movl(c_rarg1, r14); // exec_mode
2654 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2655
2656 // Need to have an oopmap that tells fetch_unroll_info where to
2657 // find any register it might need.
2658 oop_maps->add_gc_map(__ pc() - start, map);
2659
2660 __ reset_last_Java_frame(false);
2661
2662 #if INCLUDE_JVMCI
2663 if (EnableJVMCI) {
2664 __ bind(after_fetch_unroll_info_call);
2665 }
2666 #endif
2667
2668 // Load UnrollBlock* into rdi
2669 __ mov(rdi, rax);
2670
2671 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2672 Label noException;
2673 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
2674 __ jcc(Assembler::notEqual, noException);
2675 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2676 // QQQ this is useless it was NULL above
2677 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2678 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2679 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2680
2681 __ verify_oop(rax);
2682
2683 // Overwrite the result registers with the exception results.
2684 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2685 // I think this is useless
2686 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2687
2688 __ bind(noException);
2689
2690 // Only register save data is on the stack.
2691 // Now restore the result registers. Everything else is either dead
2692 // or captured in the vframeArray.
2693 RegisterSaver::restore_result_registers(masm);
2694
2695 // All of the register save area has been popped off the stack. Only the
2696 // return address remains.
2697
2698 // Pop all the frames we must move/replace.
2699 //
2700 // Frame picture (youngest to oldest)
2701 // 1: self-frame (no frame link)
2702 // 2: deopting frame (no frame link)
2703 // 3: caller of deopting frame (could be compiled/interpreted).
2704 //
2705 // Note: by leaving the return address of self-frame on the stack
2706 // and using the size of frame 2 to adjust the stack
2707 // when we are done, the return to frame 3 will still be on the stack.
2708
2709 // Pop deoptimized frame
2710 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2711 __ addptr(rsp, rcx);
2712
2713 // rsp should be pointing at the return address to the caller (3)
2714
2715 // Pick up the initial fp we should save
2716 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2717 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2718
2719 #ifdef ASSERT
2720 // Compilers generate code that bangs the stack by as much as the
2721 // interpreter would need. So this stack banging should never
2722 // trigger a fault. Verify that it does not on non product builds.
2723 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2724 __ bang_stack_size(rbx, rcx);
2725 #endif
2726
2727 // Load address of array of frame pcs into rcx
2728 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2729
2730 // Trash the old pc
2731 __ addptr(rsp, wordSize);
2732
2733 // Load address of array of frame sizes into rsi
2734 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2735
2736 // Load counter into rdx
2737 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2738
2739 // Now adjust the caller's stack to make up for the extra locals
2740 // but record the original sp so that we can save it in the skeletal interpreter
2741 // frame and the stack walking of interpreter_sender will get the unextended sp
2742 // value and not the "real" sp value.
2743
2744 const Register sender_sp = r8;
2745
2746 __ mov(sender_sp, rsp);
2747 __ movl(rbx, Address(rdi,
2748 Deoptimization::UnrollBlock::
2749 caller_adjustment_offset_in_bytes()));
2750 __ subptr(rsp, rbx);
2751
2752 // Push interpreter frames in a loop
2753 Label loop;
2754 __ bind(loop);
2755 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2756 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
2757 __ pushptr(Address(rcx, 0)); // Save return address
2758 __ enter(); // Save old & set new ebp
2759 __ subptr(rsp, rbx); // Prolog
2760 // This value is corrected by layout_activation_impl
2761 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2762 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2763 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2764 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2765 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2766 __ decrementl(rdx); // Decrement counter
2767 __ jcc(Assembler::notZero, loop);
2768 __ pushptr(Address(rcx, 0)); // Save final return address
2769
2770 // Re-push self-frame
2771 __ enter(); // Save old & set new ebp
2772
2773 // Allocate a full sized register save area.
2774 // Return address and rbp are in place, so we allocate two less words.
2775 __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2776
2777 // Restore frame locals after moving the frame
2778 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2779 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2780
2781 // Call C code. Need thread but NOT official VM entry
2782 // crud. We cannot block on this call, no GC can happen. Call should
2783 // restore return values to their stack-slots with the new SP.
2784 //
2785 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2786
2787 // Use rbp because the frames look interpreted now
2788 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2789 // Don't need the precise return PC here, just precise enough to point into this code blob.
2790 address the_pc = __ pc();
2791 __ set_last_Java_frame(noreg, rbp, the_pc);
2792
2793 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
2794 __ mov(c_rarg0, r15_thread);
2795 __ movl(c_rarg1, r14); // second arg: exec_mode
2796 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2797 // Revert SP alignment after call since we're going to do some SP relative addressing below
2798 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2799
2800 // Set an oopmap for the call site
2801 // Use the same PC we used for the last java frame
2802 oop_maps->add_gc_map(the_pc - start,
2803 new OopMap( frame_size_in_words, 0 ));
2804
2805 // Clear fp AND pc
2806 __ reset_last_Java_frame(true);
2807
2808 // Collect return values
2809 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2810 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2811 // I think this is useless (throwing pc?)
2812 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2813
2814 // Pop self-frame.
2815 __ leave(); // Epilog
2816
2817 // Jump to interpreter
2818 __ ret(0);
2819
2820 // Make sure all code is generated
2821 masm->flush();
2822
2823 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2824 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2825 #if INCLUDE_JVMCI
2826 if (EnableJVMCI) {
2827 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2828 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2829 }
2830 #endif
2831 }
2832
2833 #ifdef COMPILER2
2834 //------------------------------generate_uncommon_trap_blob--------------------
2835 void SharedRuntime::generate_uncommon_trap_blob() {
2836 // Allocate space for the code
2837 ResourceMark rm;
2838 // Setup code generation tools
2839 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2840 MacroAssembler* masm = new MacroAssembler(&buffer);
2841
2842 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2843
2844 address start = __ pc();
2845
2846 if (UseRTMLocking) {
2847 // Abort RTM transaction before possible nmethod deoptimization.
2848 __ xabort(0);
2849 }
2850
2851 // Push self-frame. We get here with a return address on the
2852 // stack, so rsp is 8-byte aligned until we allocate our frame.
2853 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog!
2854
2855 // No callee saved registers. rbp is assumed implicitly saved
2856 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2857
2858 // The compiler left unloaded_class_index in j_rarg0; move it to where the
2859 // runtime expects it.
2860 __ movl(c_rarg1, j_rarg0);
2861
2862 __ set_last_Java_frame(noreg, noreg, NULL);
2863
2864 // Call C code. Need thread but NOT official VM entry
2865 // crud. We cannot block on this call, no GC can happen. Call should
2866 // capture callee-saved registers as well as return values.
2867 // Thread is in rdi already.
2868 //
2869 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2870
2871 __ mov(c_rarg0, r15_thread);
2872 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2873 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2874
2875 // Set an oopmap for the call site
2876 OopMapSet* oop_maps = new OopMapSet();
2877 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2878
2879 // location of rbp is known implicitly by the frame sender code
2880
2881 oop_maps->add_gc_map(__ pc() - start, map);
2882
2883 __ reset_last_Java_frame(false);
2884
2885 // Load UnrollBlock* into rdi
2886 __ mov(rdi, rax);
2887
2888 #ifdef ASSERT
2889 { Label L;
2890 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2891 (int32_t)Deoptimization::Unpack_uncommon_trap);
2892 __ jcc(Assembler::equal, L);
2893 __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap");
2894 __ bind(L);
2895 }
2896 #endif
2897
2898 // Pop all the frames we must move/replace.
2899 //
2900 // Frame picture (youngest to oldest)
2901 // 1: self-frame (no frame link)
2902 // 2: deopting frame (no frame link)
2903 // 3: caller of deopting frame (could be compiled/interpreted).
2904
2905 // Pop self-frame. We have no frame, and must rely only on rax and rsp.
2906 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2907
2908 // Pop deoptimized frame (int)
2909 __ movl(rcx, Address(rdi,
2910 Deoptimization::UnrollBlock::
2911 size_of_deoptimized_frame_offset_in_bytes()));
2912 __ addptr(rsp, rcx);
2913
2914 // rsp should be pointing at the return address to the caller (3)
2915
2916 // Pick up the initial fp we should save
2917 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2918 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2919
2920 #ifdef ASSERT
2921 // Compilers generate code that bangs the stack by as much as the
2922 // interpreter would need. So this stack banging should never
2923 // trigger a fault. Verify that it does not on non product builds.
2924 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2925 __ bang_stack_size(rbx, rcx);
2926 #endif
2927
2928 // Load address of array of frame pcs into rcx (address*)
2929 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2930
2931 // Trash the return pc
2932 __ addptr(rsp, wordSize);
2933
2934 // Load address of array of frame sizes into rsi (intptr_t*)
2935 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset_in_bytes()));
2936
2937 // Counter
2938 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset_in_bytes())); // (int)
2939
2940 // Now adjust the caller's stack to make up for the extra locals but
2941 // record the original sp so that we can save it in the skeletal
2942 // interpreter frame and the stack walking of interpreter_sender
2943 // will get the unextended sp value and not the "real" sp value.
2944
2945 const Register sender_sp = r8;
2946
2947 __ mov(sender_sp, rsp);
2948 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset_in_bytes())); // (int)
2949 __ subptr(rsp, rbx);
2950
2951 // Push interpreter frames in a loop
2952 Label loop;
2953 __ bind(loop);
2954 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2955 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand
2956 __ pushptr(Address(rcx, 0)); // Save return address
2957 __ enter(); // Save old & set new rbp
2958 __ subptr(rsp, rbx); // Prolog
2959 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2960 sender_sp); // Make it walkable
2961 // This value is corrected by layout_activation_impl
2962 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2963 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2964 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2965 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2966 __ decrementl(rdx); // Decrement counter
2967 __ jcc(Assembler::notZero, loop);
2968 __ pushptr(Address(rcx, 0)); // Save final return address
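
// Roughly, in pseudo-code (illustrative sketch only; array names follow the
// UnrollBlock fields loaded above, wordSize == 8 on x86_64):
//
//   for (int k = 0; k < number_of_frames; k++) {
//     push(frame_pcs[k]);                      // return address for frame k
//     push(rbp); rbp = rsp;                    // enter(): link the new frame
//     rsp -= frame_sizes[k] - 2 * wordSize;    // body (pc and rbp already pushed)
//     interpreter_frame_sender_sp = sender_sp; // make the frame walkable
//     interpreter_frame_last_sp   = NULL;      // fixed up by layout_activation_impl
//     sender_sp = rsp;
//   }
//   push(frame_pcs[number_of_frames]);         // final return address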
2969
2970 // Re-push self-frame
2971 __ enter(); // Save old & set new rbp
2972 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2973 // Prolog
2974
2975 // Use rbp because the frames look interpreted now
2976 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2977 // Don't need the precise return PC here, just precise enough to point into this code blob.
2978 address the_pc = __ pc();
2979 __ set_last_Java_frame(noreg, rbp, the_pc);
2980
2981 // Call C code. Need thread but NOT official VM entry
2982 // crud. We cannot block on this call, no GC can happen. Call should
2983 // restore return values to their stack-slots with the new SP.
2984 // Thread is in rdi already.
2985 //
2986 // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2987
2988 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2989 __ mov(c_rarg0, r15_thread);
2990 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2991 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2992
2993 // Set an oopmap for the call site
2994 // Use the same PC we used for the last java frame
2995 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2996
2997 // Clear fp AND pc
2998 __ reset_last_Java_frame(true);
2999
3000 // Pop self-frame.
3001 __ leave(); // Epilog
3002
3003 // Jump to interpreter
3004 __ ret(0);
3005
3006 // Make sure all code is generated
3007 masm->flush();
3008
3009 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
3010 SimpleRuntimeFrame::framesize >> 1);
3011 }
3012 #endif // COMPILER2
3013
3014 //------------------------------generate_handler_blob------
3015 //
3016 // Generate a special Compile2Runtime blob that saves all registers
3017 // and sets up an oopmap.
3018 //
3019 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3020 assert(StubRoutines::forward_exception_entry() != NULL,
3021 "must be generated before");
3022
3023 ResourceMark rm;
3024 OopMapSet *oop_maps = new OopMapSet();
3025 OopMap* map;
3026
3027 // Allocate space for the code. Setup code generation tools.
3028 CodeBuffer buffer("handler_blob", 2048, 1024);
3029 MacroAssembler* masm = new MacroAssembler(&buffer);
3030
3031 address start = __ pc();
3032 address call_pc = NULL;
3033 int frame_size_in_words;
3034 bool cause_return = (poll_type == POLL_AT_RETURN);
3035 bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3036
3037 if (UseRTMLocking) {
3038 // Abort RTM transaction before calling runtime
3039 // because critical section will be large and will be
3040 // aborted anyway. Also nmethod could be deoptimized.
3041 __ xabort(0);
3042 }
3043
3044 // Make room for return address (or push it again)
3045 if (!cause_return) {
3046 __ push(rbx);
3047 }
3048
3049 // Save registers, fpu state, and flags
3050 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3051
3052 // The following is basically a call_VM. However, we need the precise
3053 // address of the call in order to generate an oopmap. Hence, we do all the
3054 // work ourselves.
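//
// In outline, the hand-rolled sequence below is (illustrative only):
//
//   set_last_Java_frame(noreg, noreg, NULL);
//   c_rarg0 = r15_thread;
//   call(call_ptr);
//   oop_maps->add_gc_map(__ pc() - start, map);   // keyed on the return pc
//
// i.e. the oop map is registered against the pc immediately after the call,
// which is why call_VM (which does not expose that pc) is not used here.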
3055
3056 __ set_last_Java_frame(noreg, noreg, NULL);
3057
3058 // The return address must always be correct so that the frame constructor never
3059 // sees an invalid pc.
3060
3061 if (!cause_return) {
3062 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3063 // Additionally, rbx is a callee saved register and we can look at it later to determine
3064 // if someone changed the return address for us!
3065 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3066 __ movptr(Address(rbp, wordSize), rbx);
3067 }
3068
3069 // Do the call
3070 __ mov(c_rarg0, r15_thread);
3071 __ call(RuntimeAddress(call_ptr));
3072
3073 // Set an oopmap for the call site. This oopmap will map all
3074 // oop-registers and debug-info registers as callee-saved. This
3075 // will allow deoptimization at this safepoint to find all possible
3076 // debug-info recordings, as well as let GC find all oops.
3077
3078 oop_maps->add_gc_map( __ pc() - start, map);
3079
3080 Label noException;
3081
3082 __ reset_last_Java_frame(false);
3083
3084 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3085 __ jcc(Assembler::equal, noException);
3086
3087 // Exception pending
3088
3089 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3090
3091 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3092
3093 // No exception case
3094 __ bind(noException);
3095
3096 Label no_adjust;
3097 #ifdef ASSERT
3098 Label bail;
3099 #endif
3100 if (!cause_return) {
3101 Label no_prefix, not_special;
3102
3103 // If our stashed return pc was modified by the runtime we avoid touching it
3104 __ cmpptr(rbx, Address(rbp, wordSize));
3105 __ jccb(Assembler::notEqual, no_adjust);
3106
3107 // Skip over the poll instruction.
3108 // See NativeInstruction::is_safepoint_poll()
3109 // Possible encodings:
3110 // 85 00 test %eax,(%rax)
3111 // 85 01 test %eax,(%rcx)
3112 // 85 02 test %eax,(%rdx)
3113 // 85 03 test %eax,(%rbx)
3114 // 85 06 test %eax,(%rsi)
3115 // 85 07 test %eax,(%rdi)
3116 //
3117 // 41 85 00 test %eax,(%r8)
3118 // 41 85 01 test %eax,(%r9)
3119 // 41 85 02 test %eax,(%r10)
3120 // 41 85 03 test %eax,(%r11)
3121 // 41 85 06 test %eax,(%r14)
3122 // 41 85 07 test %eax,(%r15)
3123 //
3124 // 85 04 24 test %eax,(%rsp)
3125 // 41 85 04 24 test %eax,(%r12)
3126 // 85 45 00 test %eax,0x0(%rbp)
3127 // 41 85 45 00 test %eax,0x0(%r13)
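//
// In effect, the code below computes the poll instruction length as
// (illustrative pseudo-code):
//
//   length  = 2;                                        // 0x85 + modrm
//   length += (byte[0] == REX.B prefix)       ? 1 : 0;  // r8..r15 base
//   length += (modrm base is rsp/rbp/r12/r13) ? 1 : 0;  // SIB byte or disp8
//   return_pc += length;                                // step over the poll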
3128
3129 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3130 __ jcc(Assembler::notEqual, no_prefix);
3131 __ addptr(rbx, 1);
3132 __ bind(no_prefix);
3133 #ifdef ASSERT
3134 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3135 #endif
3136 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3137 // r12/rsp 0x04
3138 // r13/rbp 0x05
3139 __ movzbq(rcx, Address(rbx, 1));
3140 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3141 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
3142 __ cmpptr(rcx, 1);
3143 __ jcc(Assembler::above, not_special);
3144 __ addptr(rbx, 1);
3145 __ bind(not_special);
3146 #ifdef ASSERT
3147 // Verify the correct encoding of the poll we're about to skip.
3148 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3149 __ jcc(Assembler::notEqual, bail);
3150 // Mask out the modrm bits
3151 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3152 // rax encodes to 0, so if the bits are nonzero it's incorrect
3153 __ jcc(Assembler::notZero, bail);
3154 #endif
3155 // Adjust return pc forward to step over the safepoint poll instruction
3156 __ addptr(rbx, 2);
3157 __ movptr(Address(rbp, wordSize), rbx);
3158 }
3159
3160 __ bind(no_adjust);
3161 // Normal exit, restore registers and exit.
3162 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3163 __ ret(0);
3164
3165 #ifdef ASSERT
3166 __ bind(bail);
3167 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3168 #endif
3169
3170 // Make sure all code is generated
3171 masm->flush();
3172
3173 // Fill-out other meta info
3174 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3175 }
3176
3177 //
3178 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3179 //
3180 // Generate a stub that calls into the VM to find out the proper destination
3181 // of a Java call. All the argument registers are live at this point,
3182 // but since this is generic code we don't know what they are and the caller
3183 // must do any GC of the args.
3184 //
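// In outline, the generated stub does the following (illustrative sketch):
//
//   save all live registers (wide vectors are caller-saved, so skipped);
//   call 'destination'(thread) under a last_Java_frame and an oop map;
//   if a pending exception was installed: restore registers and forward it;
//   otherwise stash the returned Method* (vm_result_2) and the resolved
//   entry point (rax) into their register-save slots, restore registers,
//   and jmp to the entry point in rax.
//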
3185 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3186 assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3187
3188 // allocate space for the code
3189 ResourceMark rm;
3190
3191 CodeBuffer buffer(name, 1000, 512);
3192 MacroAssembler* masm = new MacroAssembler(&buffer);
3193
3194 int frame_size_in_words;
3195
3196 OopMapSet *oop_maps = new OopMapSet();
3197 OopMap* map = NULL;
3198
3199 int start = __ offset();
3200
3201 // No need to save vector registers since they are caller-saved anyway.
3202 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3203
3204 int frame_complete = __ offset();
3205
3206 __ set_last_Java_frame(noreg, noreg, NULL);
3207
3208 __ mov(c_rarg0, r15_thread);
3209
3210 __ call(RuntimeAddress(destination));
3211
3212
3213 // Set an oopmap for the call site.
3214 // We need this not only for callee-saved registers, but also for volatile
3215 // registers that the compiler might be keeping live across a safepoint.
3216
3217 oop_maps->add_gc_map( __ offset() - start, map);
3218
3219 // rax contains the address we are going to jump to assuming no exception got installed
3220
3221 // clear last_Java_sp
3222 __ reset_last_Java_frame(false);
3223 // check for pending exceptions
3224 Label pending;
3225 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3226 __ jcc(Assembler::notEqual, pending);
3227
3228 // get the returned Method*
3229 __ get_vm_result_2(rbx, r15_thread);
3230 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3231
3232 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3233
3234 RegisterSaver::restore_live_registers(masm);
3235
3236 // We are back to the original state on entry and ready to go.
3237
3238 __ jmp(rax);
3239
3240 // Pending exception after the safepoint
3241
3242 __ bind(pending);
3243
3244 RegisterSaver::restore_live_registers(masm);
3245
3246 // exception pending => remove activation and forward to exception handler
3247
3248 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3249
3250 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3251 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3252
3253 // -------------
3254 // make sure all code is generated
3255 masm->flush();
3256
3257 // return the blob
3258 // Note: the frame size passed to new_runtime_stub is in words.
3259 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3260 }
3261
3262 #ifdef COMPILER2
3263 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3264
3265 class NativeInvokerGenerator : public StubCodeGenerator {
3266 address _call_target;
3267 int _shadow_space_bytes;
3268
3269 const GrowableArray<VMReg>& _input_registers;
3270 const GrowableArray<VMReg>& _output_registers;
3271
3272 int _frame_complete;
3273 int _framesize;
3274 OopMapSet* _oop_maps;
3275 public:
3276 NativeInvokerGenerator(CodeBuffer* buffer,
3277 address call_target,
3278 int shadow_space_bytes,
3279 const GrowableArray<VMReg>& input_registers,
3280 const GrowableArray<VMReg>& output_registers)
3281 : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3282 _call_target(call_target),
3283 _shadow_space_bytes(shadow_space_bytes),
3284 _input_registers(input_registers),
3285 _output_registers(output_registers),
3286 _frame_complete(0),
3287 _framesize(0),
3288 _oop_maps(NULL) {
3289 assert(_output_registers.length() <= 1
3290 || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3291
3292 }
3293
3294 void generate();
3295
3296 int spill_size_in_bytes() const {
3297 if (_output_registers.length() == 0) {
3298 return 0;
3299 }
3300 VMReg reg = _output_registers.at(0);
3301 assert(reg->is_reg(), "must be a register");
3302 if (reg->is_Register()) {
3303 return 8;
3304 } else if (reg->is_XMMRegister()) {
3305 if (UseAVX >= 3) {
3306 return 64;
3307 } else if (UseAVX >= 1) {
3308 return 32;
3309 } else {
3310 return 16;
3311 }
3312 } else {
3313 ShouldNotReachHere();
3314 }
3315 return 0;
3316 }
3317
3318 void spill_out_registers() {
3319 if (_output_registers.length() == 0) {
3320 return;
3321 }
3322 VMReg reg = _output_registers.at(0);
3323 assert(reg->is_reg(), "must be a register");
3324 MacroAssembler* masm = _masm;
3325 if (reg->is_Register()) {
3326 __ movptr(Address(rsp, 0), reg->as_Register());
3327 } else if (reg->is_XMMRegister()) {
3328 if (UseAVX >= 3) {
3329 __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3330 } else if (UseAVX >= 1) {
3331 __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3332 } else {
3333 __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3334 }
3335 } else {
3336 ShouldNotReachHere();
3337 }
3338 }
3339
3340 void fill_out_registers() {
3341 if (_output_registers.length() == 0) {
3342 return;
3343 }
3344 VMReg reg = _output_registers.at(0);
3345 assert(reg->is_reg(), "must be a register");
3346 MacroAssembler* masm = _masm;
3347 if (reg->is_Register()) {
3348 __ movptr(reg->as_Register(), Address(rsp, 0));
3349 } else if (reg->is_XMMRegister()) {
3350 if (UseAVX >= 3) {
3351 __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3352 } else if (UseAVX >= 1) {
3353 __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3354 } else {
3355 __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3356 }
3357 } else {
3358 ShouldNotReachHere();
3359 }
3360 }
3361
3362 int frame_complete() const {
3363 return _frame_complete;
3364 }
3365
3366 int framesize() const {
3367 return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3368 }
3369
3370 OopMapSet* oop_maps() const {
3371 return _oop_maps;
3372 }
3373
3374 private:
3375 #ifdef ASSERT
3376 bool target_uses_register(VMReg reg) {
3377 return _input_registers.contains(reg) || _output_registers.contains(reg);
3378 }
3379 #endif
3380 };
3381
3382 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3383 int shadow_space_bytes,
3384 const GrowableArray<VMReg>& input_registers,
3385 const GrowableArray<VMReg>& output_registers) {
3386 int locs_size = 64;
3387 CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3388 NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3389 g.generate();
3390 code.log_section_sizes("nep_invoker_blob");
3391
3392 RuntimeStub* stub =
3393 RuntimeStub::new_runtime_stub("nep_invoker_blob",
3394 &code,
3395 g.frame_complete(),
3396 g.framesize(),
3397 g.oop_maps(), false);
3398 return stub;
3399 }
3400
3401 void NativeInvokerGenerator::generate() {
3402 assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3403
3404 enum layout {
3405 rbp_off,
3406 rbp_off2,
3407 return_off,
3408 return_off2,
3409 framesize // inclusive of return address
3410 };
3411
3412 _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3413 assert(is_even(_framesize/2), "sp not 16-byte aligned");
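
// Worked example (illustrative, assuming no shadow space and a single
// integer return register): spill_size_in_bytes() == 8, so
// _framesize = align_up(4 + (0 + 8)/4, 4) = 8 slots = 32 bytes including
// the saved rbp and return address, i.e. a multiple of 16 bytes, which is
// what the alignment assert above checks.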
3414
3415 _oop_maps = new OopMapSet();
3416 MacroAssembler* masm = _masm;
3417
3418 address start = __ pc();
3419
3420 __ enter();
3421
3422 // return address and rbp are already in place
3423 __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3424
3425 _frame_complete = __ pc() - start;
3426
3427 address the_pc = __ pc();
3428
3429 __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3430 OopMap* map = new OopMap(_framesize, 0);
3431 _oop_maps->add_gc_map(the_pc - start, map);
3432
3433 // State transition
3434 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3435
3436 __ call(RuntimeAddress(_call_target));
3437
3438 __ restore_cpu_control_state_after_jni();
3439
3440 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3441
3442 // Force this write out before the read below
3443 __ membar(Assembler::Membar_mask_bits(
3444 Assembler::LoadLoad | Assembler::LoadStore |
3445 Assembler::StoreLoad | Assembler::StoreStore));
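
// (The fence pairs the store of _thread_in_native_trans above with the
// safepoint-poll and suspend-flag loads below, so a VM thread arming a
// safepoint either observes this thread's state transition or this thread
// observes the armed poll.)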
3446
3447 Label L_after_safepoint_poll;
3448 Label L_safepoint_poll_slow_path;
3449
3450 __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3451 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3452 __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3453
3454 __ bind(L_after_safepoint_poll);
3455
3456 // change thread state
3457 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3458
3459 __ block_comment("reguard stack check");
3460 Label L_reguard;
3461 Label L_after_reguard;
3462 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3463 __ jcc(Assembler::equal, L_reguard);
3464 __ bind(L_after_reguard);
3465
3466 __ reset_last_Java_frame(r15_thread, true);
3467
3468 __ leave(); // required for proper stackwalking of RuntimeStub frame
3469 __ ret(0);
3470
3471 //////////////////////////////////////////////////////////////////////////////
3472
3473 __ block_comment("{ L_safepoint_poll_slow_path");
3474 __ bind(L_safepoint_poll_slow_path);
3475 __ vzeroupper();
3476
3477 spill_out_registers();
3478
3479 __ mov(c_rarg0, r15_thread);
3480 __ mov(r12, rsp); // remember sp
3481 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3482 __ andptr(rsp, -16); // align stack as required by ABI
3483 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3484 __ mov(rsp, r12); // restore sp
3485 __ reinit_heapbase();
3486
3487 fill_out_registers();
3488
3489 __ jmp(L_after_safepoint_poll);
3490 __ block_comment("} L_safepoint_poll_slow_path");
3491
3492 //////////////////////////////////////////////////////////////////////////////
3493
3494 __ block_comment("{ L_reguard");
3495 __ bind(L_reguard);
3496 __ vzeroupper();
3497
3498 spill_out_registers();
3499
3500 __ mov(r12, rsp); // remember sp
3501 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3502 __ andptr(rsp, -16); // align stack as required by ABI
3503 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3504 __ mov(rsp, r12); // restore sp
3505 __ reinit_heapbase();
3506
3507 fill_out_registers();
3508
3509 __ jmp(L_after_reguard);
3510
3511 __ block_comment("} L_reguard");
3512
3513 //////////////////////////////////////////////////////////////////////////////
3514
3515 __ flush();
3516 }
3517 #endif // COMPILER2
3518
3519 //------------------------------Montgomery multiplication------------------------
3520 //
3521
3522 #ifndef _WINDOWS
3523
3524 // Subtract 0:b from carry:a. Return carry.
3525 static julong
3526 sub(julong a[], julong b[], julong carry, long len) {
3527 long long i = 0, cnt = len;
3528 julong tmp;
3529 asm volatile("clc; "
3530 "0: ; "
3531 "mov (%[b], %[i], 8), %[tmp]; "
3532 "sbb %[tmp], (%[a], %[i], 8); "
3533 "inc %[i]; dec %[cnt]; "
3534 "jne 0b; "
3535 "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3536 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3537 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3538 : "memory");
3539 return tmp;
3540 }
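
// Worked example (illustrative): with len == 1, a == { 5 }, b == { 7 } and
// carry == 1, the sbb loop leaves a[0] == 0xFFFFFFFFFFFFFFFE (i.e. 2^64 + 5 - 7)
// and the trailing "sbb $0" consumes the borrow, so the function returns 0.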
3541
3542 // Multiply (unsigned) Long A by Long B, accumulating the double-
3543 // length result into the accumulator formed of T0, T1, and T2.
3544 #define MACC(A, B, T0, T1, T2) \
3545 do { \
3546 unsigned long hi, lo; \
3547 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3548 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3549 : "r"(A), "a"(B) : "cc"); \
3550 } while(0)
3551
3552 // As above, but add twice the double-length result into the
3553 // accumulator.
3554 #define MACC2(A, B, T0, T1, T2) \
3555 do { \
3556 unsigned long hi, lo; \
3557 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3558 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3559 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3560 : "r"(A), "a"(B) : "cc"); \
3561 } while(0)
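
// In other words (illustrative note): MACC adds the full 128-bit product A*B
// into the three-word accumulator T2:T1:T0, and MACC2 adds it twice. For
// example, MACC(1ULL << 32, 1ULL << 32, t0, t1, t2) leaves t0 unchanged and
// adds 1 to t1 (plus any carry into t2), since the product is exactly 2^64.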
3562
3563 #else //_WINDOWS
3564
3565 static julong
3566 sub(julong a[], julong b[], julong carry, long len) {
3567 long i;
3568 julong tmp;
3569 unsigned char c = 1;
3570 for (i = 0; i < len; i++) {
3571 c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3572 a[i] = tmp;
3573 }
3574 c = _addcarry_u64(c, carry, ~0, &tmp);
3575 return tmp;
3576 }
3577
3578 // Multiply (unsigned) Long A by Long B, accumulating the double-
3579 // length result into the accumulator formed of T0, T1, and T2.
3580 #define MACC(A, B, T0, T1, T2) \
3581 do { \
3582 julong hi, lo; \
3583 lo = _umul128(A, B, &hi); \
3584 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3585 c = _addcarry_u64(c, hi, T1, &T1); \
3586 _addcarry_u64(c, T2, 0, &T2); \
3587 } while(0)
3588
3589 // As above, but add twice the double-length result into the
3590 // accumulator.
3591 #define MACC2(A, B, T0, T1, T2) \
3592 do { \
3593 julong hi, lo; \
3594 lo = _umul128(A, B, &hi); \
3595 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3596 c = _addcarry_u64(c, hi, T1, &T1); \
3597 _addcarry_u64(c, T2, 0, &T2); \
3598 c = _addcarry_u64(0, lo, T0, &T0); \
3599 c = _addcarry_u64(c, hi, T1, &T1); \
3600 _addcarry_u64(c, T2, 0, &T2); \
3601 } while(0)
3602
3603 #endif //_WINDOWS
3604
3605 // Fast Montgomery multiplication. The derivation of the algorithm is
3606 // in A Cryptographic Library for the Motorola DSP56000,
3607 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
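//
// In Montgomery terms (illustrative summary): with R = 2^(64*len) (len 64-bit
// words) and inv == -n[0]^-1 mod 2^64 (hence the asserts that
// inv * n[0] == ULLONG_MAX), the routine below computes m such that
// m == a * b * R^-1 (mod n).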
3608
3609 static void NOINLINE
3610 montgomery_multiply(julong a[], julong b[], julong n[],
3611 julong m[], julong inv, int len) {
3612 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3613 int i;
3614
3615 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3616
3617 for (i = 0; i < len; i++) {
3618 int j;
3619 for (j = 0; j < i; j++) {
3620 MACC(a[j], b[i-j], t0, t1, t2);
3621 MACC(m[j], n[i-j], t0, t1, t2);
3622 }
3623 MACC(a[i], b[0], t0, t1, t2);
3624 m[i] = t0 * inv;
3625 MACC(m[i], n[0], t0, t1, t2);
3626
3627 assert(t0 == 0, "broken Montgomery multiply");
3628
3629 t0 = t1; t1 = t2; t2 = 0;
3630 }
3631
3632 for (i = len; i < 2*len; i++) {
3633 int j;
3634 for (j = i-len+1; j < len; j++) {
3635 MACC(a[j], b[i-j], t0, t1, t2);
3636 MACC(m[j], n[i-j], t0, t1, t2);
3637 }
3638 m[i-len] = t0;
3639 t0 = t1; t1 = t2; t2 = 0;
3640 }
3641
3642 while (t0)
3643 t0 = sub(m, n, t0, len);
3644 }
3645
3646 // Fast Montgomery squaring. This uses asymptotically 25% fewer
3647 // multiplies so it should be up to 25% faster than Montgomery
3648 // multiplication. However, its loop control is more complex and it
3649 // may actually run slower on some machines.
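//
// Illustrative count: a full multiply issues about len^2 MACCs for the a*b
// terms plus len^2 for the m*n terms. Squaring folds a[j]*a[i-j] with
// a[i-j]*a[j] into a single MACC2, roughly halving the a-products to len^2/2
// while the m*n work stays the same, i.e. about 25% fewer multiplies overall.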
3650
3651 static void NOINLINE
3652 montgomery_square(julong a[], julong n[],
3653 julong m[], julong inv, int len) {
3654 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3655 int i;
3656
3657 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3658
3659 for (i = 0; i < len; i++) {
3660 int j;
3661 int end = (i+1)/2;
3662 for (j = 0; j < end; j++) {
3663 MACC2(a[j], a[i-j], t0, t1, t2);
3664 MACC(m[j], n[i-j], t0, t1, t2);
3665 }
3666 if ((i & 1) == 0) {
3667 MACC(a[j], a[j], t0, t1, t2);
3668 }
3669 for (; j < i; j++) {
3670 MACC(m[j], n[i-j], t0, t1, t2);
3671 }
3672 m[i] = t0 * inv;
3673 MACC(m[i], n[0], t0, t1, t2);
3674
3675 assert(t0 == 0, "broken Montgomery square");
3676
3677 t0 = t1; t1 = t2; t2 = 0;
3678 }
3679
3680 for (i = len; i < 2*len; i++) {
3681 int start = i-len+1;
3682 int end = start + (len - start)/2;
3683 int j;
3684 for (j = start; j < end; j++) {
3685 MACC2(a[j], a[i-j], t0, t1, t2);
3686 MACC(m[j], n[i-j], t0, t1, t2);
3687 }
3688 if ((i & 1) == 0) {
3689 MACC(a[j], a[j], t0, t1, t2);
3690 }
3691 for (; j < len; j++) {
3692 MACC(m[j], n[i-j], t0, t1, t2);
3693 }
3694 m[i-len] = t0;
3695 t0 = t1; t1 = t2; t2 = 0;
3696 }
3697
3698 while (t0)
3699 t0 = sub(m, n, t0, len);
3700 }
3701
3702 // Swap words in a longword.
3703 static julong swap(julong x) {
3704 return (x << 32) | (x >> 32);
3705 }
3706
3707 // Copy len longwords from s to d, word-swapping as we go. The
3708 // destination array is reversed.
3709 static void reverse_words(julong *s, julong *d, int len) {
3710 d += len;
3711 while(len-- > 0) {
3712 d--;
3713 *d = swap(*s);
3714 s++;
3715 }
3716 }
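
// Worked example (illustrative): with len == 2 and
// s == { 0x0000000100000002, 0x0000000300000004 }, the destination becomes
// d == { 0x0000000400000003, 0x0000000200000001 }, i.e. index 0 of d holds
// the word-swapped last element of s.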
3717
3718 // The threshold at which squaring is advantageous was determined
3719 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3720 #define MONTGOMERY_SQUARING_THRESHOLD 64
3721
3722 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3723 jint len, jlong inv,
3724 jint *m_ints) {
3725 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3726 int longwords = len/2;
3727
3728 // Make very sure we don't use so much space that the stack might
3729 // overflow. 512 jints correspond to a 16384-bit integer and will use
3730 // a total of 8K bytes of stack space here (four arrays of 256 julongs).
3731 int divisor = sizeof(julong) * 4;
3732 guarantee(longwords <= 8192 / divisor, "must be");
3733 int total_allocation = longwords * sizeof (julong) * 4;
3734 julong *scratch = (julong *)alloca(total_allocation);
3735
3736 // Local scratch arrays
3737 julong
3738 *a = scratch + 0 * longwords,
3739 *b = scratch + 1 * longwords,
3740 *n = scratch + 2 * longwords,
3741 *m = scratch + 3 * longwords;
3742
3743 reverse_words((julong *)a_ints, a, longwords);
3744 reverse_words((julong *)b_ints, b, longwords);
3745 reverse_words((julong *)n_ints, n, longwords);
3746
3747 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3748
3749 reverse_words(m, (julong *)m_ints, longwords);
3750 }
3751
3752 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3753 jint len, jlong inv,
3754 jint *m_ints) {
3755 assert(len % 2 == 0, "array length in montgomery_square must be even");
3756 int longwords = len/2;
3757
3758 // Make very sure we don't use so much space that the stack might
3759 // overflow. 512 jints correspond to a 16384-bit integer and will use
3760 // a total of 6K bytes of stack space here (three arrays of 256 julongs).
3761 int divisor = sizeof(julong) * 3;
3762 guarantee(longwords <= (8192 / divisor), "must be");
3763 int total_allocation = longwords * sizeof (julong) * 3;
3764 julong *scratch = (julong *)alloca(total_allocation);
3765
3766 // Local scratch arrays
3767 julong
3768 *a = scratch + 0 * longwords,
3769 *n = scratch + 1 * longwords,
3770 *m = scratch + 2 * longwords;
3771
3772 reverse_words((julong *)a_ints, a, longwords);
3773 reverse_words((julong *)n_ints, n, longwords);
3774
3775 if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3776 ::montgomery_square(a, n, m, (julong)inv, longwords);
3777 } else {
3778 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3779 }
3780
3781 reverse_words(m, (julong *)m_ints, longwords);
3782 }
3783
3784 #ifdef COMPILER2
3785 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3786 //
3787 //------------------------------generate_exception_blob---------------------------
3788 // Creates the exception blob at the end.
3789 // This code (the exception blob) is jumped to from a compiled method
3790 // (see emit_exception_handler in the x86_64.ad file).
3791 //
3792 // Given an exception pc at a call, we call into the runtime for the
3793 // handler in this method. This handler might merely restore state
3794 // (i.e. callee-saved registers), unwind the frame, and jump to the
3795 // exception handler for the nmethod if there is no Java-level handler
3796 // for the nmethod.
3797 //
3798 // This code is entered with a jmp.
3799 //
3800 // Arguments:
3801 // rax: exception oop
3802 // rdx: exception pc
3803 //
3804 // Results:
3805 // rax: exception oop
3806 // rdx: exception pc in caller or ???
3807 // destination: exception handler of caller
3808 //
3809 // Note: the exception pc MUST be at a call (precise debug information)
3810 // Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3811 //
3812
3813 void OptoRuntime::generate_exception_blob() {
3814 assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3815 assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3816 assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3817
3818 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3819
3820 // Allocate space for the code
3821 ResourceMark rm;
3822 // Setup code generation tools
3823 CodeBuffer buffer("exception_blob", 2048, 1024);
3824 MacroAssembler* masm = new MacroAssembler(&buffer);
3825
3826
3827 address start = __ pc();
3828
3829 // Exception pc is 'return address' for stack walker
3830 __ push(rdx);
3831 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3832
3833 // Save callee-saved registers. See x86_64.ad.
3834
3835 // rbp is an implicitly saved callee-saved register (i.e., the calling
3836 // convention will save/restore it in the prolog/epilog). Other than that
3837 // there are no callee-saved registers now that adapter frames are gone.
3838
3839 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3840
3841 // Store exception in Thread object. We cannot pass any arguments to the
3842 // handle_exception call, since we do not want to make any assumption
3843 // about the size of the frame where the exception happened in.
3844 // c_rarg0 is either rdi (Linux) or rcx (Windows).
3845 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3846 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3847
3848 // This call does all the hard work. It checks if an exception handler
3849 // exists in the method.
3850 // If so, it returns the handler address.
3851 // If not, it prepares for stack-unwinding, restoring the callee-save
3852 // registers of the frame being removed.
3853 //
3854 // address OptoRuntime::handle_exception_C(JavaThread* thread)
3855
3856 // At a method handle call, the stack may not be properly aligned
3857 // when returning with an exception.
3858 address the_pc = __ pc();
3859 __ set_last_Java_frame(noreg, noreg, the_pc);
3860 __ mov(c_rarg0, r15_thread);
3861 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
3862 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3863
3864 // Set an oopmap for the call site. This oopmap will only be used if we
3865 // are unwinding the stack. Hence, all locations will be dead.
3866 // Callee-saved registers will be the same as the frame above (i.e.,
3867 // handle_exception_stub), since they were restored when we got the
3868 // exception.
3869
3870 OopMapSet* oop_maps = new OopMapSet();
3871
3872 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3873
3874 __ reset_last_Java_frame(false);
3875
3876 // Restore callee-saved registers
3877
3878 // rbp is an implicitly saved callee-saved register (i.e., the calling
3879 // convention will save/restore it in the prolog/epilog). Other than that
3880 // there are no callee-saved registers now that adapter frames are gone.
3881
3882 __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3883
3884 __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3885 __ pop(rdx); // No need for exception pc anymore
3886
3887 // rax: exception handler
3888
3889 // We have a handler in rax (could be deopt blob).
3890 __ mov(r8, rax);
3891
3892 // Get the exception oop
3893 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3894 // Get the exception pc in case we are deoptimized
3895 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3896 #ifdef ASSERT
3897 __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3898 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3899 #endif
3900 // Clear the exception oop so GC no longer processes it as a root.
3901 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3902
3903 // rax: exception oop
3904 // r8: exception handler
3905 // rdx: exception pc
3906 // Jump to handler
3907
3908 __ jmp(r8);
3909
3910 // Make sure all code is generated
3911 masm->flush();
3912
3913 // Set exception blob
3914 _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3915 }
3916 #endif // COMPILER2
3917
3918 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
3919 int total_in_args, const VMRegPair* in_regs,
3920 int total_out_args, VMRegPair* out_regs,
3921 GrowableArray<int>& arg_order,
3922 VMRegPair tmp_vmreg) {
3923 ComputeMoveOrder order(total_in_args, in_regs,
3924 total_out_args, out_regs,
3925 in_sig_bt, arg_order, tmp_vmreg);
3926 }