1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/debugInfoRec.hpp"
  32 #include "code/icBuffer.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/compiledICHolder.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/jniHandles.hpp"
  48 #include "runtime/safepointMechanism.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/signature.hpp"
  51 #include "runtime/stubRoutines.hpp"
  52 #include "runtime/vframeArray.hpp"
  53 #include "runtime/vm_version.hpp"
  54 #include "utilities/align.hpp"
  55 #include "utilities/formatBuffer.hpp"
  56 #include "vmreg_x86.inline.hpp"
  57 #ifdef COMPILER1
  58 #include "c1/c1_Runtime1.hpp"
  59 #endif
  60 #ifdef COMPILER2
  61 #include "opto/runtime.hpp"
  62 #endif
  63 #if INCLUDE_JVMCI
  64 #include "jvmci/jvmciJavaClasses.hpp"
  65 #endif
  66 
  67 #define __ masm->
  68 
  69 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  70 
  71 class SimpleRuntimeFrame {
  72 
  73   public:
  74 
  75   // Most of the runtime stubs have this simple frame layout.
  76   // This class exists to make the layout shared in one place.
  77   // Offsets are for compiler stack slots, which are jints.
  78   enum layout {
  79     // The frame sender code expects that rbp will be in the "natural" place and
  80     // will override any oopMap setting for it. We must therefore force the layout
  81     // so that it agrees with the frame sender code.
  82     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  83     rbp_off2,
  84     return_off, return_off2,
  85     framesize
  86   };
  87 };
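     // A rough sketch of the layout above, assuming frame::arg_reg_save_area_bytes == 0
     // (as on non-Windows x86_64).  Offsets are 4-byte compiler slots, so each 64-bit
     // value occupies a pair of slots:
     //   rbp_off/rbp_off2       == slots 0/1  -- the saved rbp
     //   return_off/return_off2 == slots 2/3  -- the return address, just above rbp
     //   framesize              == 4 slots    == 16 bytes, keeping 16-byte alignment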
  88 
  89 class RegisterSaver {
  90   // Capture info about frame layout.  Layout offsets are in jint
  91   // units because compiler frame slots are jints.
  92 #define XSAVE_AREA_BEGIN 160
  93 #define XSAVE_AREA_YMM_BEGIN 576
  94 #define XSAVE_AREA_OPMASK_BEGIN 1088
  95 #define XSAVE_AREA_ZMM_BEGIN 1152
  96 #define XSAVE_AREA_UPPERBANK 1664
  97 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  98 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  99 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 100 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 101 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 102   enum layout {
 103     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 104     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 105     DEF_XMM_OFFS(0),
 106     DEF_XMM_OFFS(1),
 107     // 2..15 are implied in range usage
 108     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 109     DEF_YMM_OFFS(0),
 110     DEF_YMM_OFFS(1),
 111     // 2..15 are implied in range usage
 112     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 113     DEF_OPMASK_OFFS(0),
 114     DEF_OPMASK_OFFS(1),
 115     // 2..7 are implied in range usage
 116     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 117     DEF_ZMM_OFFS(0),
 118     DEF_ZMM_OFFS(1),
 119     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 120     DEF_ZMM_UPPER_OFFS(16),
 121     DEF_ZMM_UPPER_OFFS(17),
 122     // 18..31 are implied in range usage
 123     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 124     fpu_stateH_end,
 125     r15_off, r15H_off,
 126     r14_off, r14H_off,
 127     r13_off, r13H_off,
 128     r12_off, r12H_off,
 129     r11_off, r11H_off,
 130     r10_off, r10H_off,
 131     r9_off,  r9H_off,
 132     r8_off,  r8H_off,
 133     rdi_off, rdiH_off,
 134     rsi_off, rsiH_off,
 135     ignore_off, ignoreH_off,  // extra copy of rbp
 136     rsp_off, rspH_off,
 137     rbx_off, rbxH_off,
 138     rdx_off, rdxH_off,
 139     rcx_off, rcxH_off,
 140     rax_off, raxH_off,
 141     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 142     align_off, alignH_off,
 143     flags_off, flagsH_off,
 144     // The frame sender code expects that rbp will be in the "natural" place and
 145     // will override any oopMap setting for it. We must therefore force the layout
 146     // so that it agrees with the frame sender code.
 147     rbp_off, rbpH_off,        // copy of rbp we will restore
 148     return_off, returnH_off,  // slot for return address
 149     reg_save_size             // size in compiler stack slots
 150   };
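       // For orientation (a sketch assuming frame::arg_reg_save_area_bytes == 0, as on
       // non-Windows x86_64), the enum above works out to fpu_state_off == 0 and, in
       // 4-byte slots:
       //   xmm_off       == 160/4  == 40    (legacy FXSAVE xmm0..xmm15 area)
       //   ymm_off       == 576/4  == 144   (upper halves of ymm0..ymm15)
       //   opmask_off    == 1088/4 == 272   (k0..k7)
       //   zmm_off       == 1152/4 == 288   (upper halves of zmm0..zmm15)
       //   zmm_upper_off == 1664/4 == 416   (full zmm16..zmm31)
       // followed by the GPR save slots, the alignment word, flags, the extra rbp copy
       // and the return address.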
 151 
 152  public:
 153   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 154   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 155 
 156   // Offsets into the register save area
 157   // Used by deoptimization when it is managing result register
 158   // values on its own
 159 
 160   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 161   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 162   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 163   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 164   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 165 
 166   // During deoptimization only the result registers need to be restored,
 167   // all the other values have already been extracted.
 168   static void restore_result_registers(MacroAssembler* masm);
 169 };
 170 
 171 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 172   int off = 0;
 173   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 174   if (UseAVX < 3) {
 175     num_xmm_regs = num_xmm_regs/2;
 176   }
 177 #if COMPILER2_OR_JVMCI
 178   if (save_wide_vectors && UseAVX == 0) {
 179     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 180   }
 181   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 182 #else
 183   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 184 #endif
 185 
 186   // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 187   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 188   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 189   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 190   // CodeBlob frame size is in words.
 191   int frame_size_in_words = frame_size_in_bytes / wordSize;
 192   *total_frame_words = frame_size_in_words;
 193 
 194   // Save registers, fpu state, and flags.
 195   // We assume caller has already pushed the return address onto the
 196   // stack, so rsp is 8-byte aligned here.
  197   // We push rbp twice in this sequence because we want the real rbp
  198   // to be under the return address like a normal enter.
 199 
 200   __ enter();          // rsp becomes 16-byte aligned here
 201   __ push_CPU_state(); // Push a multiple of 16 bytes
 202 
  203   // push_CPU_state handles this on EVEX enabled targets
 204   if (save_wide_vectors) {
 205     // Save upper half of YMM registers(0..15)
 206     int base_addr = XSAVE_AREA_YMM_BEGIN;
 207     for (int n = 0; n < 16; n++) {
 208       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 209     }
 210     if (VM_Version::supports_evex()) {
 211       // Save upper half of ZMM registers(0..15)
 212       base_addr = XSAVE_AREA_ZMM_BEGIN;
 213       for (int n = 0; n < 16; n++) {
 214         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 215       }
 216       // Save full ZMM registers(16..num_xmm_regs)
 217       base_addr = XSAVE_AREA_UPPERBANK;
 218       off = 0;
 219       int vector_len = Assembler::AVX_512bit;
 220       for (int n = 16; n < num_xmm_regs; n++) {
 221         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 222       }
 223 #if COMPILER2_OR_JVMCI
 224       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 225       off = 0;
 226       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 227         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 228       }
 229 #endif
 230     }
 231   } else {
 232     if (VM_Version::supports_evex()) {
 233       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 234       int base_addr = XSAVE_AREA_UPPERBANK;
 235       off = 0;
 236       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 237       for (int n = 16; n < num_xmm_regs; n++) {
 238         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 239       }
 240 #if COMPILER2_OR_JVMCI
 241       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 242       off = 0;
 243       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 244         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 245       }
 246 #endif
 247     }
 248   }
 249   __ vzeroupper();
 250   if (frame::arg_reg_save_area_bytes != 0) {
 251     // Allocate argument register save area
 252     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 253   }
 254 
 255   // Set an oopmap for the call site.  This oopmap will map all
 256   // oop-registers and debug-info registers as callee-saved.  This
 257   // will allow deoptimization at this safepoint to find all possible
 258   // debug-info recordings, as well as let GC find all oops.
 259 
 260   OopMapSet *oop_maps = new OopMapSet();
 261   OopMap* map = new OopMap(frame_size_in_slots, 0);
 262 
 263 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 264 
 265   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 266   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 267   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 269   // rbp location is known implicitly by the frame sender code, needs no oopmap
  270   // and the location where rbp was saved is ignored
 271   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 272   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 273   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 281   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 282   // on EVEX enabled targets, we get it included in the xsave area
 283   off = xmm0_off;
 284   int delta = xmm1_off - off;
 285   for (int n = 0; n < 16; n++) {
 286     XMMRegister xmm_name = as_XMMRegister(n);
 287     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 288     off += delta;
 289   }
 290   if (UseAVX > 2) {
 291     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 292     off = zmm16_off;
 293     delta = zmm17_off - off;
 294     for (int n = 16; n < num_xmm_regs; n++) {
 295       XMMRegister zmm_name = as_XMMRegister(n);
 296       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 297       off += delta;
 298     }
 299   }
 300 
 301 #if COMPILER2_OR_JVMCI
 302   if (save_wide_vectors) {
 303     // Save upper half of YMM registers(0..15)
 304     off = ymm0_off;
 305     delta = ymm1_off - ymm0_off;
 306     for (int n = 0; n < 16; n++) {
 307       XMMRegister ymm_name = as_XMMRegister(n);
 308       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 309       off += delta;
 310     }
 311     if (VM_Version::supports_evex()) {
 312       // Save upper half of ZMM registers(0..15)
 313       off = zmm0_off;
 314       delta = zmm1_off - zmm0_off;
 315       for (int n = 0; n < 16; n++) {
 316         XMMRegister zmm_name = as_XMMRegister(n);
 317         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 318         off += delta;
 319       }
 320     }
 321   }
 322 #endif // COMPILER2_OR_JVMCI
 323 
 324   // %%% These should all be a waste but we'll keep things as they were for now
 325   if (true) {
 326     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 327     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 328     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 330     // rbp location is known implicitly by the frame sender code, needs no oopmap
 331     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 333     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 341     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 342     // on EVEX enabled targets, we get it included in the xsave area
 343     off = xmm0H_off;
 344     delta = xmm1H_off - off;
 345     for (int n = 0; n < 16; n++) {
 346       XMMRegister xmm_name = as_XMMRegister(n);
 347       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 348       off += delta;
 349     }
 350     if (UseAVX > 2) {
 351       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 352       off = zmm16H_off;
 353       delta = zmm17H_off - off;
 354       for (int n = 16; n < num_xmm_regs; n++) {
 355         XMMRegister zmm_name = as_XMMRegister(n);
 356         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 357         off += delta;
 358       }
 359     }
 360   }
 361 
 362   return map;
 363 }
 364 
 365 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 366   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 367   if (UseAVX < 3) {
 368     num_xmm_regs = num_xmm_regs/2;
 369   }
 370   if (frame::arg_reg_save_area_bytes != 0) {
 371     // Pop arg register save area
 372     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 373   }
 374 
 375 #if COMPILER2_OR_JVMCI
 376   if (restore_wide_vectors) {
 377     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 378     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 379   }
 380 #else
 381   assert(!restore_wide_vectors, "vectors are generated only by C2");
 382 #endif
 383 
 384   __ vzeroupper();
 385 
 386   // On EVEX enabled targets everything is handled in pop fpu state
 387   if (restore_wide_vectors) {
 388     // Restore upper half of YMM registers (0..15)
 389     int base_addr = XSAVE_AREA_YMM_BEGIN;
 390     for (int n = 0; n < 16; n++) {
 391       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 392     }
 393     if (VM_Version::supports_evex()) {
 394       // Restore upper half of ZMM registers (0..15)
 395       base_addr = XSAVE_AREA_ZMM_BEGIN;
 396       for (int n = 0; n < 16; n++) {
 397         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 398       }
 399       // Restore full ZMM registers(16..num_xmm_regs)
 400       base_addr = XSAVE_AREA_UPPERBANK;
 401       int vector_len = Assembler::AVX_512bit;
 402       int off = 0;
 403       for (int n = 16; n < num_xmm_regs; n++) {
 404         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 405       }
 406 #if COMPILER2_OR_JVMCI
 407       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 408       off = 0;
 409       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 410         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 411       }
 412 #endif
 413     }
 414   } else {
 415     if (VM_Version::supports_evex()) {
 416       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 417       int base_addr = XSAVE_AREA_UPPERBANK;
 418       int off = 0;
 419       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 420       for (int n = 16; n < num_xmm_regs; n++) {
 421         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 422       }
 423 #if COMPILER2_OR_JVMCI
 424       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 425       off = 0;
 426       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 427         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 428       }
 429 #endif
 430     }
 431   }
 432 
 433   // Recover CPU state
 434   __ pop_CPU_state();
 435   // Get the rbp described implicitly by the calling convention (no oopMap)
 436   __ pop(rbp);
 437 }
 438 
 439 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 440 
 441   // Just restore result register. Only used by deoptimization. By
 442   // now any callee save register that needs to be restored to a c2
 443   // caller of the deoptee has been extracted into the vframeArray
 444   // and will be stuffed into the c2i adapter we create for later
 445   // restoration so only result registers need to be restored here.
 446 
 447   // Restore fp result register
 448   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 449   // Restore integer result register
 450   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 451   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 452 
  453   // Pop all of the register save area off the stack except the return address
 454   __ addptr(rsp, return_offset_in_bytes());
 455 }
 456 
  457 // Is the vector's size (in bytes) bigger than the size saved by default?
  458 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 459 bool SharedRuntime::is_wide_vector(int size) {
 460   return size > 16;
 461 }
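     // (So, for example, 32-byte YMM and 64-byte ZMM vectors count as "wide" and take
     // the explicit save/restore paths in RegisterSaver above.)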
 462 
 463 // ---------------------------------------------------------------------------
 464 // Read the array of BasicTypes from a signature, and compute where the
 465 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 466 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 467 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 468 // as framesizes are fixed.
  469 // VMRegImpl::stack0 refers to the first slot 0(sp),
  470 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Registers
  471 // (up to RegisterImpl::number_of_registers) are the 64-bit
  472 // integer registers.
 473 
 474 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 475 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 476 // units regardless of build. Of course for i486 there is no 64 bit build
 477 
 478 // The Java calling convention is a "shifted" version of the C ABI.
 479 // By skipping the first C ABI register we can call non-static jni methods
 480 // with small numbers of arguments without having to shuffle the arguments
 481 // at all. Since we control the java ABI we ought to at least get some
 482 // advantage out of it.
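     // For example (a sketch of how the loop below assigns things), a virtual call
     // with Java signature (JILjava/lang/Object;D)V would be laid out as
     //   sig_bt = { T_OBJECT /*receiver*/, T_LONG, T_VOID, T_INT, T_OBJECT, T_DOUBLE, T_VOID }
     // and would get
     //   receiver -> j_rarg0,  long -> j_rarg1,  int -> j_rarg2,  Object -> j_rarg3,
     //   double   -> j_farg0,  and the T_VOID halves -> set_bad()
     // with stk_args == 0, so no outgoing stack args are needed at all.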
 483 
 484 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 485                                            VMRegPair *regs,
 486                                            int total_args_passed) {
 487 
 488   // Create the mapping between argument positions and
 489   // registers.
 490   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 491     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 492   };
 493   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 494     j_farg0, j_farg1, j_farg2, j_farg3,
 495     j_farg4, j_farg5, j_farg6, j_farg7
 496   };
 497 
 498 
 499   uint int_args = 0;
 500   uint fp_args = 0;
 501   uint stk_args = 0; // inc by 2 each time
 502 
 503   for (int i = 0; i < total_args_passed; i++) {
 504     switch (sig_bt[i]) {
 505     case T_BOOLEAN:
 506     case T_CHAR:
 507     case T_BYTE:
 508     case T_SHORT:
 509     case T_INT:
 510       if (int_args < Argument::n_int_register_parameters_j) {
 511         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 512       } else {
 513         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 514         stk_args += 2;
 515       }
 516       break;
 517     case T_VOID:
 518       // halves of T_LONG or T_DOUBLE
 519       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 520       regs[i].set_bad();
 521       break;
 522     case T_LONG:
 523       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 524       // fall through
 525     case T_OBJECT:
 526     case T_ARRAY:
 527     case T_ADDRESS:
 528       if (int_args < Argument::n_int_register_parameters_j) {
 529         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 530       } else {
 531         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 532         stk_args += 2;
 533       }
 534       break;
 535     case T_FLOAT:
 536       if (fp_args < Argument::n_float_register_parameters_j) {
 537         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 538       } else {
 539         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 540         stk_args += 2;
 541       }
 542       break;
 543     case T_DOUBLE:
 544       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 545       if (fp_args < Argument::n_float_register_parameters_j) {
 546         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 547       } else {
 548         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 549         stk_args += 2;
 550       }
 551       break;
 552     default:
 553       ShouldNotReachHere();
 554       break;
 555     }
 556   }
 557 
 558   return align_up(stk_args, 2);
 559 }
 560 
 561 // Patch the callers callsite with entry to compiled code if it exists.
 562 static void patch_callers_callsite(MacroAssembler *masm) {
 563   Label L;
 564   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
 565   __ jcc(Assembler::equal, L);
 566 
 567   // Save the current stack pointer
 568   __ mov(r13, rsp);
 569   // Schedule the branch target address early.
 570   // Call into the VM to patch the caller, then jump to compiled callee
 571   // rax isn't live so capture return address while we easily can
 572   __ movptr(rax, Address(rsp, 0));
 573 
 574   // align stack so push_CPU_state doesn't fault
 575   __ andptr(rsp, -(StackAlignmentInBytes));
 576   __ push_CPU_state();
 577   __ vzeroupper();
 578   // VM needs caller's callsite
 579   // VM needs target method
 580   // This needs to be a long call since we will relocate this adapter to
 581   // the codeBuffer and it may not reach
 582 
 583   // Allocate argument register save area
 584   if (frame::arg_reg_save_area_bytes != 0) {
 585     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 586   }
 587   __ mov(c_rarg0, rbx);
 588   __ mov(c_rarg1, rax);
 589   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 590 
 591   // De-allocate argument register save area
 592   if (frame::arg_reg_save_area_bytes != 0) {
 593     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 594   }
 595 
 596   __ vzeroupper();
 597   __ pop_CPU_state();
 598   // restore sp
 599   __ mov(rsp, r13);
 600   __ bind(L);
 601 }
 602 
 603 
 604 static void gen_c2i_adapter(MacroAssembler *masm,
 605                             int total_args_passed,
 606                             int comp_args_on_stack,
 607                             const BasicType *sig_bt,
 608                             const VMRegPair *regs,
 609                             Label& skip_fixup) {
 610   // Before we get into the guts of the C2I adapter, see if we should be here
 611   // at all.  We've come from compiled code and are attempting to jump to the
 612   // interpreter, which means the caller made a static call to get here
 613   // (vcalls always get a compiled target if there is one).  Check for a
 614   // compiled target.  If there is one, we need to patch the caller's call.
 615   patch_callers_callsite(masm);
 616 
 617   __ bind(skip_fixup);
 618 
  619   // Since all args are passed on the stack, total_args_passed *
  620   // Interpreter::stackElementSize is the space we need. Plus one word because
  621   // we also account for the return address location, since
  622   // we store it first rather than hold it in rax across all the shuffling.
 623 
 624   int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
 625 
 626   // stack is aligned, keep it that way
 627   extraspace = align_up(extraspace, 2*wordSize);
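       // E.g. (a sketch, using the 4-arg layout pictured further down): with
       // total_args_passed == 4 and Interpreter::stackElementSize == 8,
       // extraspace == 4*8 + 8 == 40, rounded up to 48 to keep 16-byte alignment.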
 628 
 629   // Get return address
 630   __ pop(rax);
 631 
 632   // set senderSP value
 633   __ mov(r13, rsp);
 634 
 635   __ subptr(rsp, extraspace);
 636 
 637   // Store the return address in the expected location
 638   __ movptr(Address(rsp, 0), rax);
 639 
 640   // Now write the args into the outgoing interpreter space
 641   for (int i = 0; i < total_args_passed; i++) {
 642     if (sig_bt[i] == T_VOID) {
 643       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 644       continue;
 645     }
 646 
 647     // offset to start parameters
 648     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 649     int next_off = st_off - Interpreter::stackElementSize;
 650 
 651     // Say 4 args:
 652     // i   st_off
 653     // 0   32 T_LONG
 654     // 1   24 T_VOID
 655     // 2   16 T_OBJECT
 656     // 3    8 T_BOOL
 657     // -    0 return address
 658     //
  659     // However, to make things extra confusing: because we can fit a long/double in
  660     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
  661     // leaves one slot empty and only stores to a single slot. In this case the
  662     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
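         // In the 4-arg example above: the T_LONG at i == 0 has st_off == 32 and
         // next_off == 24, so its 8-byte value lands at offset 24 -- the slot that
         // nominally belongs to the T_VOID half at i == 1 -- while offset 32 is left
         // unused (and filled with known junk under ASSERT).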
 663 
 664     VMReg r_1 = regs[i].first();
 665     VMReg r_2 = regs[i].second();
 666     if (!r_1->is_valid()) {
 667       assert(!r_2->is_valid(), "");
 668       continue;
 669     }
 670     if (r_1->is_stack()) {
  671       // memory to memory, use rax
 672       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 673       if (!r_2->is_valid()) {
 674         // sign extend??
 675         __ movl(rax, Address(rsp, ld_off));
 676         __ movptr(Address(rsp, st_off), rax);
 677 
 678       } else {
 679 
 680         __ movq(rax, Address(rsp, ld_off));
 681 
  682         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 683         // T_DOUBLE and T_LONG use two slots in the interpreter
 684         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 685           // ld_off == LSW, ld_off+wordSize == MSW
 686           // st_off == MSW, next_off == LSW
 687           __ movq(Address(rsp, next_off), rax);
 688 #ifdef ASSERT
 689           // Overwrite the unused slot with known junk
 690           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 691           __ movptr(Address(rsp, st_off), rax);
 692 #endif /* ASSERT */
 693         } else {
 694           __ movq(Address(rsp, st_off), rax);
 695         }
 696       }
 697     } else if (r_1->is_Register()) {
 698       Register r = r_1->as_Register();
 699       if (!r_2->is_valid()) {
  700         // must be only an int (or less) so move only 32 bits to the slot
 701         // why not sign extend??
 702         __ movl(Address(rsp, st_off), r);
 703       } else {
  704         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 705         // T_DOUBLE and T_LONG use two slots in the interpreter
 706         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 707           // long/double in gpr
 708 #ifdef ASSERT
 709           // Overwrite the unused slot with known junk
 710           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 711           __ movptr(Address(rsp, st_off), rax);
 712 #endif /* ASSERT */
 713           __ movq(Address(rsp, next_off), r);
 714         } else {
 715           __ movptr(Address(rsp, st_off), r);
 716         }
 717       }
 718     } else {
 719       assert(r_1->is_XMMRegister(), "");
 720       if (!r_2->is_valid()) {
  721         // only a float, use just part of the slot
 722         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 723       } else {
 724 #ifdef ASSERT
 725         // Overwrite the unused slot with known junk
 726         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 727         __ movptr(Address(rsp, st_off), rax);
 728 #endif /* ASSERT */
 729         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 730       }
 731     }
 732   }
 733 
 734   // Schedule the branch target address early.
 735   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 736   __ jmp(rcx);
 737 }
 738 
 739 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 740                         address code_start, address code_end,
 741                         Label& L_ok) {
 742   Label L_fail;
 743   __ lea(temp_reg, ExternalAddress(code_start));
 744   __ cmpptr(pc_reg, temp_reg);
 745   __ jcc(Assembler::belowEqual, L_fail);
 746   __ lea(temp_reg, ExternalAddress(code_end));
 747   __ cmpptr(pc_reg, temp_reg);
 748   __ jcc(Assembler::below, L_ok);
 749   __ bind(L_fail);
 750 }
 751 
 752 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 753                                     int total_args_passed,
 754                                     int comp_args_on_stack,
 755                                     const BasicType *sig_bt,
 756                                     const VMRegPair *regs) {
 757 
  758   // Note: r13 contains the senderSP on entry. We must preserve it since
  759   // we may do an i2c -> c2i transition if we lose a race where compiled
  760   // code goes non-entrant while we get args ready.
  761   // In addition we use r13 to locate all the interpreter args because
  762   // we must align the stack to 16 bytes on an i2c entry, else we
  763   // lose the alignment we expect in all compiled code, and the register
  764   // save code can segv when fxsave instructions find an improperly
  765   // aligned stack pointer.
 766 
 767   // Adapters can be frameless because they do not require the caller
 768   // to perform additional cleanup work, such as correcting the stack pointer.
 769   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 770   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 771   // even if a callee has modified the stack pointer.
 772   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 773   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 774   // up via the senderSP register).
 775   // In other words, if *either* the caller or callee is interpreted, we can
 776   // get the stack pointer repaired after a call.
 777   // This is why c2i and i2c adapters cannot be indefinitely composed.
 778   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 779   // both caller and callee would be compiled methods, and neither would
 780   // clean up the stack pointer changes performed by the two adapters.
 781   // If this happens, control eventually transfers back to the compiled
 782   // caller, but with an uncorrected stack, causing delayed havoc.
 783 
 784   // Pick up the return address
 785   __ movptr(rax, Address(rsp, 0));
 786 
 787   if (VerifyAdapterCalls &&
 788       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
 789     // So, let's test for cascading c2i/i2c adapters right now.
 790     //  assert(Interpreter::contains($return_addr) ||
 791     //         StubRoutines::contains($return_addr),
 792     //         "i2c adapter must return to an interpreter frame");
 793     __ block_comment("verify_i2c { ");
 794     Label L_ok;
 795     if (Interpreter::code() != NULL)
 796       range_check(masm, rax, r11,
 797                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
 798                   L_ok);
 799     if (StubRoutines::code1() != NULL)
 800       range_check(masm, rax, r11,
 801                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
 802                   L_ok);
 803     if (StubRoutines::code2() != NULL)
 804       range_check(masm, rax, r11,
 805                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
 806                   L_ok);
 807     const char* msg = "i2c adapter must return to an interpreter frame";
 808     __ block_comment(msg);
 809     __ stop(msg);
 810     __ bind(L_ok);
  811     __ block_comment("} verify_i2c ");
 812   }
 813 
 814   // Must preserve original SP for loading incoming arguments because
 815   // we need to align the outgoing SP for compiled code.
 816   __ movptr(r11, rsp);
 817 
  818   // Cut-out for having no stack args.  Since up to 6 int/oop args are passed
 819   // in registers, we will occasionally have no stack args.
 820   int comp_words_on_stack = 0;
 821   if (comp_args_on_stack) {
 822     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
 823     // registers are below.  By subtracting stack0, we either get a negative
 824     // number (all values in registers) or the maximum stack slot accessed.
 825 
 826     // Convert 4-byte c2 stack slots to words.
 827     comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
  828     // Round up to minimum stack alignment, in wordSize
 829     comp_words_on_stack = align_up(comp_words_on_stack, 2);
 830     __ subptr(rsp, comp_words_on_stack * wordSize);
 831   }
 832 
 833 
 834   // Ensure compiled code always sees stack at proper alignment
 835   __ andptr(rsp, -16);
 836 
  837   // push the return address, misaligning the stack so that the youngest frame always
  838   // sees the stack just as it would after a call instruction
 839   __ push(rax);
 840 
 841   // Put saved SP in another register
 842   const Register saved_sp = rax;
 843   __ movptr(saved_sp, r11);
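       // At this point (a summary of the code above, not new behaviour): saved_sp (rax)
       // holds the original, unaligned SP for locating the incoming interpreter args,
       // while rsp has been grown for any outgoing stack args, aligned down to 16 bytes,
       // and topped with the re-pushed return address.  r11 is now free to hold the
       // register-jump target loaded below.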
 844 
 845   // Will jump to the compiled code just as if compiled code was doing it.
 846   // Pre-load the register-jump target early, to schedule it better.
 847   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 848 
 849 #if INCLUDE_JVMCI
 850   if (EnableJVMCI) {
 851     // check if this call should be routed towards a specific entry point
 852     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 853     Label no_alternative_target;
 854     __ jcc(Assembler::equal, no_alternative_target);
 855     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 856     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 857     __ bind(no_alternative_target);
 858   }
 859 #endif // INCLUDE_JVMCI
 860 
 861   // Now generate the shuffle code.  Pick up all register args and move the
 862   // rest through the floating point stack top.
 863   for (int i = 0; i < total_args_passed; i++) {
 864     if (sig_bt[i] == T_VOID) {
 865       // Longs and doubles are passed in native word order, but misaligned
 866       // in the 32-bit build.
 867       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 868       continue;
 869     }
 870 
 871     // Pick up 0, 1 or 2 words from SP+offset.
 872 
 873     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 874             "scrambled load targets?");
 875     // Load in argument order going down.
 876     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 877     // Point to interpreter value (vs. tag)
 878     int next_off = ld_off - Interpreter::stackElementSize;
 879     //
 880     //
 881     //
 882     VMReg r_1 = regs[i].first();
 883     VMReg r_2 = regs[i].second();
 884     if (!r_1->is_valid()) {
 885       assert(!r_2->is_valid(), "");
 886       continue;
 887     }
 888     if (r_1->is_stack()) {
  889       // Convert stack slot to an SP offset (+ wordSize to account for return address)
 890       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 891 
 892       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
  893       // and if we end up going through a c2i because of a miss, a reasonable value of r13
  894       // will be generated.
 895       if (!r_2->is_valid()) {
 896         // sign extend???
 897         __ movl(r13, Address(saved_sp, ld_off));
 898         __ movptr(Address(rsp, st_off), r13);
 899       } else {
 900         //
  901         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
  902         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
  903         // so we must adjust where to pick up the data to match the interpreter.
  904         //
  905         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
  906         // are accessed at negative offsets, so the LSW is at the LOW address
 907 
 908         // ld_off is MSW so get LSW
 909         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 910                            next_off : ld_off;
 911         __ movq(r13, Address(saved_sp, offset));
 912         // st_off is LSW (i.e. reg.first())
 913         __ movq(Address(rsp, st_off), r13);
 914       }
 915     } else if (r_1->is_Register()) {  // Register argument
 916       Register r = r_1->as_Register();
 917       assert(r != rax, "must be different");
 918       if (r_2->is_valid()) {
 919         //
  920         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
  921         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
  922         // so we must adjust where to pick up the data to match the interpreter.
 923 
 924         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 925                            next_off : ld_off;
 926 
 927         // this can be a misaligned move
 928         __ movq(r, Address(saved_sp, offset));
 929       } else {
 930         // sign extend and use a full word?
 931         __ movl(r, Address(saved_sp, ld_off));
 932       }
 933     } else {
 934       if (!r_2->is_valid()) {
 935         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 936       } else {
 937         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 938       }
 939     }
 940   }
 941 
 942   // 6243940 We might end up in handle_wrong_method if
  943   // the callee is deoptimized as we race through here. If that
 944   // happens we don't want to take a safepoint because the
 945   // caller frame will look interpreted and arguments are now
 946   // "compiled" so it is much better to make this transition
 947   // invisible to the stack walking code. Unfortunately if
 948   // we try and find the callee by normal means a safepoint
 949   // is possible. So we stash the desired callee in the thread
  950   // and the VM will find it there should this case occur.
 951 
 952   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 953 
  954   // put Method* where a c2i would expect it should we end up there;
  955   // only needed because c2 resolve stubs return the Method* as a result in
  956   // rax
 957   __ mov(rax, rbx);
 958   __ jmp(r11);
 959 }
 960 
 961 // ---------------------------------------------------------------
 962 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 963                                                             int total_args_passed,
 964                                                             int comp_args_on_stack,
 965                                                             const BasicType *sig_bt,
 966                                                             const VMRegPair *regs,
 967                                                             AdapterFingerPrint* fingerprint) {
 968   address i2c_entry = __ pc();
 969 
 970   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 971 
 972   // -------------------------------------------------------------------------
 973   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 974   // to the interpreter.  The args start out packed in the compiled layout.  They
 975   // need to be unpacked into the interpreter layout.  This will almost always
 976   // require some stack space.  We grow the current (compiled) stack, then repack
  977   // the args.  We finally end in a jump to the generic interpreter entry point.
  978   // On exit from the interpreter, the interpreter will restore our SP (lest the
  979   // compiled code, which relies solely on SP and not RBP, get sick).
 980 
 981   address c2i_unverified_entry = __ pc();
 982   Label skip_fixup;
 983   Label ok;
 984 
 985   Register holder = rax;
 986   Register receiver = j_rarg0;
 987   Register temp = rbx;
 988 
 989   {
 990     __ load_klass(temp, receiver, rscratch1);
 991     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
 992     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
 993     __ jcc(Assembler::equal, ok);
 994     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 995 
 996     __ bind(ok);
 997     // Method might have been compiled since the call site was patched to
  998     // interpreted; if that is the case, treat it as a miss so we can get
 999     // the call site corrected.
1000     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
1001     __ jcc(Assembler::equal, skip_fixup);
1002     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1003   }
1004 
1005   address c2i_entry = __ pc();
1006 
1007   // Class initialization barrier for static methods
1008   address c2i_no_clinit_check_entry = NULL;
1009   if (VM_Version::supports_fast_class_init_checks()) {
1010     Label L_skip_barrier;
1011     Register method = rbx;
1012 
1013     { // Bypass the barrier for non-static methods
1014       Register flags  = rscratch1;
1015       __ movl(flags, Address(method, Method::access_flags_offset()));
1016       __ testl(flags, JVM_ACC_STATIC);
1017       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1018     }
1019 
1020     Register klass = rscratch1;
1021     __ load_method_holder(klass, method);
1022     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1023 
1024     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1025 
1026     __ bind(L_skip_barrier);
1027     c2i_no_clinit_check_entry = __ pc();
1028   }
1029 
1030   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1031   bs->c2i_entry_barrier(masm);
1032 
1033   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1034 
1035   __ flush();
1036   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1037 }
1038 
1039 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1040                                          VMRegPair *regs,
1041                                          VMRegPair *regs2,
1042                                          int total_args_passed) {
1043   assert(regs2 == NULL, "not needed on x86");
1044 // We return the amount of VMRegImpl stack slots we need to reserve for all
1045 // the arguments NOT counting out_preserve_stack_slots.
1046 
1047 // NOTE: These arrays will have to change when c1 is ported
1048 #ifdef _WIN64
1049     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1050       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1051     };
1052     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1053       c_farg0, c_farg1, c_farg2, c_farg3
1054     };
1055 #else
1056     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1057       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1058     };
1059     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1060       c_farg0, c_farg1, c_farg2, c_farg3,
1061       c_farg4, c_farg5, c_farg6, c_farg7
1062     };
1063 #endif // _WIN64
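         // A worked example (a sketch, not normative): for sig_bt == { T_INT, T_DOUBLE,
         // T_VOID, T_OBJECT } the loop below yields
         //   SysV:  int -> c_rarg0, double -> c_farg0, Object -> c_rarg1, return value 0
         //   Win64: int -> c_rarg0, double -> c_farg1, Object -> c_rarg2, and because
         //          every register arg also reserves a 2-slot home-space pair the
         //          function returns 8 (the minimum enforced at the end).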
1064 
1065 
1066     uint int_args = 0;
1067     uint fp_args = 0;
1068     uint stk_args = 0; // inc by 2 each time
1069 
1070     for (int i = 0; i < total_args_passed; i++) {
1071       switch (sig_bt[i]) {
1072       case T_BOOLEAN:
1073       case T_CHAR:
1074       case T_BYTE:
1075       case T_SHORT:
1076       case T_INT:
1077         if (int_args < Argument::n_int_register_parameters_c) {
1078           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1079 #ifdef _WIN64
1080           fp_args++;
1081           // Allocate slots for the callee to stuff register args on the stack.
1082           stk_args += 2;
1083 #endif
1084         } else {
1085           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1086           stk_args += 2;
1087         }
1088         break;
1089       case T_LONG:
1090         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1091         // fall through
1092       case T_OBJECT:
1093       case T_ARRAY:
1094       case T_ADDRESS:
1095       case T_METADATA:
1096         if (int_args < Argument::n_int_register_parameters_c) {
1097           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1098 #ifdef _WIN64
1099           fp_args++;
1100           stk_args += 2;
1101 #endif
1102         } else {
1103           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1104           stk_args += 2;
1105         }
1106         break;
1107       case T_FLOAT:
1108         if (fp_args < Argument::n_float_register_parameters_c) {
1109           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1110 #ifdef _WIN64
1111           int_args++;
1112           // Allocate slots for the callee to stuff register args on the stack.
1113           stk_args += 2;
1114 #endif
1115         } else {
1116           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1117           stk_args += 2;
1118         }
1119         break;
1120       case T_DOUBLE:
1121         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1122         if (fp_args < Argument::n_float_register_parameters_c) {
1123           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1124 #ifdef _WIN64
1125           int_args++;
1126           // Allocate slots for the callee to stuff register args on the stack.
1127           stk_args += 2;
1128 #endif
1129         } else {
1130           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1131           stk_args += 2;
1132         }
1133         break;
1134       case T_VOID: // Halves of longs and doubles
1135         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1136         regs[i].set_bad();
1137         break;
1138       default:
1139         ShouldNotReachHere();
1140         break;
1141       }
1142     }
1143 #ifdef _WIN64
1144   // The Windows ABI requires that we always allocate enough stack space
1145   // for 4 64-bit registers to be stored down.
1146   if (stk_args < 8) {
1147     stk_args = 8;
1148   }
1149 #endif // _WIN64
1150 
1151   return stk_args;
1152 }
1153 
1154 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1155                                              uint num_bits,
1156                                              uint total_args_passed) {
1157   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1158          "only certain vector sizes are supported for now");
1159 
1160   static const XMMRegister VEC_ArgReg[32] = {
1161      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1162      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1163     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1164     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1165   };
1166 
1167   uint stk_args = 0;
1168   uint fp_args = 0;
1169 
1170   for (uint i = 0; i < total_args_passed; i++) {
1171     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1172     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1173     regs[i].set_pair(vmreg->next(next_val), vmreg);
1174   }
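       // (With num_bits == 256, for example, each regs[i] describes xmm_i spanning
       // eight 32-bit VMReg slots, vmreg .. vmreg->next(7), and stk_args stays 0,
       // so no stack slots are ever reserved.)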
1175 
1176   return stk_args;
1177 }
1178 
1179 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1180   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1181   // which by this time is free to use.
1182   switch (ret_type) {
1183   case T_FLOAT:
1184     __ movflt(Address(rbp, -wordSize), xmm0);
1185     break;
1186   case T_DOUBLE:
1187     __ movdbl(Address(rbp, -wordSize), xmm0);
1188     break;
1189   case T_VOID:  break;
1190   default: {
1191     __ movptr(Address(rbp, -wordSize), rax);
1192     }
1193   }
1194 }
1195 
1196 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1197   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1198   // which by this time is free to use.
1199   switch (ret_type) {
1200   case T_FLOAT:
1201     __ movflt(xmm0, Address(rbp, -wordSize));
1202     break;
1203   case T_DOUBLE:
1204     __ movdbl(xmm0, Address(rbp, -wordSize));
1205     break;
1206   case T_VOID:  break;
1207   default: {
1208     __ movptr(rax, Address(rbp, -wordSize));
1209     }
1210   }
1211 }
1212 
1213 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1214     for ( int i = first_arg ; i < arg_count ; i++ ) {
1215       if (args[i].first()->is_Register()) {
1216         __ push(args[i].first()->as_Register());
1217       } else if (args[i].first()->is_XMMRegister()) {
1218         __ subptr(rsp, 2*wordSize);
1219         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1220       }
1221     }
1222 }
1223 
1224 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1225     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1226       if (args[i].first()->is_Register()) {
1227         __ pop(args[i].first()->as_Register());
1228       } else if (args[i].first()->is_XMMRegister()) {
1229         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1230         __ addptr(rsp, 2*wordSize);
1231       }
1232     }
1233 }
1234 
1235 // Unpack an array argument into a pointer to the body and the length
1236 // if the array is non-null, otherwise pass 0 for both.
1237 static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
1238   Register tmp_reg = rax;
1239   assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
1240          "possible collision");
1241   assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
1242          "possible collision");
1243 
1244   __ block_comment("unpack_array_argument {");
1245 
1246   // Pass the length, ptr pair
1247   Label is_null, done;
1248   VMRegPair tmp;
1249   tmp.set_ptr(tmp_reg->as_VMReg());
1250   if (reg.first()->is_stack()) {
1251     // Load the arg up from the stack
1252     __ move_ptr(reg, tmp);
1253     reg = tmp;
1254   }
1255   __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
1256   __ jccb(Assembler::equal, is_null);
1257   __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1258   __ move_ptr(tmp, body_arg);
1259   // load the length relative to the body.
1260   __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
1261                            arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1262   __ move32_64(tmp, length_arg);
1263   __ jmpb(done);
1264   __ bind(is_null);
1265   // Pass zeros
1266   __ xorptr(tmp_reg, tmp_reg);
1267   __ move_ptr(tmp, body_arg);
1268   __ move32_64(tmp, length_arg);
1269   __ bind(done);
1270 
1271   __ block_comment("} unpack_array_argument");
1272 }
1273 
1274 
1275 // Different signatures may require very different orders for the move
1276 // to avoid clobbering other arguments.  There's no simple way to
1277 // order them safely.  Compute a safe order for issuing stores and
1278 // break any cycles in those stores.  This code is fairly general but
1279 // it's not necessary on the other platforms so we keep it in the
1280 // platform dependent code instead of moving it into a shared file.
1281 // (See bugs 7013347 & 7145024.)
1282 // Note that this code is specific to LP64.
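     // For instance (an illustrative case, not taken from a real signature): if one
     // incoming arg must move rdi -> rsi while another must move rsi -> rdi, the two
     // stores form a cycle; break_cycle() redirects one of the stores into the temp
     // register and appends a final store from the temp to that store's original
     // destination, so neither source is clobbered before it has been read.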
1283 class ComputeMoveOrder: public StackObj {
1284   class MoveOperation: public ResourceObj {
1285     friend class ComputeMoveOrder;
1286    private:
1287     VMRegPair        _src;
1288     VMRegPair        _dst;
1289     int              _src_index;
1290     int              _dst_index;
1291     bool             _processed;
1292     MoveOperation*  _next;
1293     MoveOperation*  _prev;
1294 
1295     static int get_id(VMRegPair r) {
1296       return r.first()->value();
1297     }
1298 
1299    public:
1300     MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1301       _src(src)
1302     , _dst(dst)
1303     , _src_index(src_index)
1304     , _dst_index(dst_index)
1305     , _processed(false)
1306     , _next(NULL)
1307     , _prev(NULL) {
1308     }
1309 
1310     VMRegPair src() const              { return _src; }
1311     int src_id() const                 { return get_id(src()); }
1312     int src_index() const              { return _src_index; }
1313     VMRegPair dst() const              { return _dst; }
    void set_dst(int i, VMRegPair dst) { _dst_index = i; _dst = dst; }
1315     int dst_index() const              { return _dst_index; }
1316     int dst_id() const                 { return get_id(dst()); }
1317     MoveOperation* next() const       { return _next; }
1318     MoveOperation* prev() const       { return _prev; }
1319     void set_processed()               { _processed = true; }
1320     bool is_processed() const          { return _processed; }
1321 
    // Break a cycle by inserting an extra store through the temp register.
1323     void break_cycle(VMRegPair temp_register) {
1324       // create a new store following the last store
1325       // to move from the temp_register to the original
1326       MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1327 
1328       // break the cycle of links and insert new_store at the end
1329       // break the reverse link.
1330       MoveOperation* p = prev();
1331       assert(p->next() == this, "must be");
1332       _prev = NULL;
1333       p->_next = new_store;
1334       new_store->_prev = p;
1335 
      // change the original store to save its value in the temp.
1337       set_dst(-1, temp_register);
1338     }
1339 
1340     void link(GrowableArray<MoveOperation*>& killer) {
      // link this store in front of the store that it depends on
1342       MoveOperation* n = killer.at_grow(src_id(), NULL);
1343       if (n != NULL) {
1344         assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1345         _next = n;
1346         n->_prev = this;
1347       }
1348     }
1349   };
1350 
1351  private:
1352   GrowableArray<MoveOperation*> edges;
1353 
1354  public:
1355   ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1356                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1357     // Move operations where the dest is the stack can all be
1358     // scheduled first since they can't interfere with the other moves.
1359     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1360       if (in_sig_bt[i] == T_ARRAY) {
1361         c_arg--;
1362         if (out_regs[c_arg].first()->is_stack() &&
1363             out_regs[c_arg + 1].first()->is_stack()) {
1364           arg_order.push(i);
1365           arg_order.push(c_arg);
1366         } else {
1367           if (out_regs[c_arg].first()->is_stack() ||
1368               in_regs[i].first() == out_regs[c_arg].first()) {
1369             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1370           } else {
1371             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1372           }
1373         }
1374       } else if (in_sig_bt[i] == T_VOID) {
1375         arg_order.push(i);
1376         arg_order.push(c_arg);
1377       } else {
1378         if (out_regs[c_arg].first()->is_stack() ||
1379             in_regs[i].first() == out_regs[c_arg].first()) {
1380           arg_order.push(i);
1381           arg_order.push(c_arg);
1382         } else {
1383           add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1384         }
1385       }
1386     }
    // Break any cycles in the register moves and emit them in the
    // proper order.
1389     GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1390     for (int i = 0; i < stores->length(); i++) {
1391       arg_order.push(stores->at(i)->src_index());
1392       arg_order.push(stores->at(i)->dst_index());
1393     }
1394  }
1395 
  // Collect the move operations; no-op moves (same source and destination) are skipped.
1397   void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1398     if (src.first() == dst.first()) return;
1399     edges.append(new MoveOperation(src_index, src, dst_index, dst));
1400   }
1401 
  // Walk the edges breaking cycles between moves.  The result list
  // can be walked in order to produce the proper set of stores.
1404   GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1405     // Record which moves kill which values
1406     GrowableArray<MoveOperation*> killer;
1407     for (int i = 0; i < edges.length(); i++) {
1408       MoveOperation* s = edges.at(i);
1409       assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1410       killer.at_put_grow(s->dst_id(), s, NULL);
1411     }
1412     assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1413            "make sure temp isn't in the registers that are killed");
1414 
1415     // create links between loads and stores
1416     for (int i = 0; i < edges.length(); i++) {
1417       edges.at(i)->link(killer);
1418     }
1419 
1420     // at this point, all the move operations are chained together
1421     // in a doubly linked list.  Processing it backwards finds
1422     // the beginning of the chain, forwards finds the end.  If there's
1423     // a cycle it can be broken at any point,  so pick an edge and walk
1424     // backward until the list ends or we end where we started.
1425     GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1426     for (int e = 0; e < edges.length(); e++) {
1427       MoveOperation* s = edges.at(e);
1428       if (!s->is_processed()) {
1429         MoveOperation* start = s;
1430         // search for the beginning of the chain or cycle
1431         while (start->prev() != NULL && start->prev() != s) {
1432           start = start->prev();
1433         }
1434         if (start->prev() == s) {
1435           start->break_cycle(temp_register);
1436         }
1437         // walk the chain forward inserting to store list
1438         while (start != NULL) {
1439           stores->append(start);
1440           start->set_processed();
1441           start = start->next();
1442         }
1443       }
1444     }
1445     return stores;
1446   }
1447 };
1448 
1449 static void verify_oop_args(MacroAssembler* masm,
1450                             const methodHandle& method,
1451                             const BasicType* sig_bt,
1452                             const VMRegPair* regs) {
1453   Register temp_reg = rbx;  // not part of any compiled calling seq
1454   if (VerifyOops) {
1455     for (int i = 0; i < method->size_of_parameters(); i++) {
1456       if (is_reference_type(sig_bt[i])) {
1457         VMReg r = regs[i].first();
1458         assert(r->is_valid(), "bad oop arg");
1459         if (r->is_stack()) {
1460           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1461           __ verify_oop(temp_reg);
1462         } else {
1463           __ verify_oop(r->as_Register());
1464         }
1465       }
1466     }
1467   }
1468 }
1469 
1470 static void gen_special_dispatch(MacroAssembler* masm,
1471                                  const methodHandle& method,
1472                                  const BasicType* sig_bt,
1473                                  const VMRegPair* regs) {
1474   verify_oop_args(masm, method, sig_bt, regs);
1475   vmIntrinsics::ID iid = method->intrinsic_id();
1476 
1477   // Now write the args into the outgoing interpreter space
1478   bool     has_receiver   = false;
1479   Register receiver_reg   = noreg;
1480   int      member_arg_pos = -1;
1481   Register member_reg     = noreg;
1482   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1483   if (ref_kind != 0) {
1484     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1485     member_reg = rbx;  // known to be free at this point
1486     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1487   } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1488     has_receiver = true;
1489   } else {
1490     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1491   }
1492 
1493   if (member_reg != noreg) {
1494     // Load the member_arg into register, if necessary.
1495     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1496     VMReg r = regs[member_arg_pos].first();
1497     if (r->is_stack()) {
1498       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1499     } else {
1500       // no data motion is needed
1501       member_reg = r->as_Register();
1502     }
1503   }
1504 
1505   if (has_receiver) {
1506     // Make sure the receiver is loaded into a register.
1507     assert(method->size_of_parameters() > 0, "oob");
1508     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1509     VMReg r = regs[0].first();
1510     assert(r->is_valid(), "bad receiver arg");
1511     if (r->is_stack()) {
1512       // Porting note:  This assumes that compiled calling conventions always
1513       // pass the receiver oop in a register.  If this is not true on some
1514       // platform, pick a temp and load the receiver from stack.
1515       fatal("receiver always in a register");
1516       receiver_reg = j_rarg0;  // known to be free at this point
1517       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1518     } else {
1519       // no data motion is needed
1520       receiver_reg = r->as_Register();
1521     }
1522   }
1523 
1524   // Figure out which address we are really jumping to:
1525   MethodHandles::generate_method_handle_dispatch(masm, iid,
1526                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1527 }
1528 
1529 // ---------------------------------------------------------------------------
1530 // Generate a native wrapper for a given method.  The method takes arguments
1531 // in the Java compiled code convention, marshals them to the native
1532 // convention (handlizes oops, etc), transitions to native, makes the call,
1533 // returns to java state (possibly blocking), unhandlizes any result and
1534 // returns.
1535 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1544 //
1545 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1546                                                 const methodHandle& method,
1547                                                 int compile_id,
1548                                                 BasicType* in_sig_bt,
1549                                                 VMRegPair* in_regs,
1550                                                 BasicType ret_type,
1551                                                 address critical_entry) {
1552   if (method->is_method_handle_intrinsic()) {
1553     vmIntrinsics::ID iid = method->intrinsic_id();
1554     intptr_t start = (intptr_t)__ pc();
1555     int vep_offset = ((intptr_t)__ pc()) - start;
1556     gen_special_dispatch(masm,
1557                          method,
1558                          in_sig_bt,
1559                          in_regs);
1560     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1561     __ flush();
1562     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1563     return nmethod::new_native_nmethod(method,
1564                                        compile_id,
1565                                        masm->code(),
1566                                        vep_offset,
1567                                        frame_complete,
1568                                        stack_slots / VMRegImpl::slots_per_word,
1569                                        in_ByteSize(-1),
1570                                        in_ByteSize(-1),
1571                                        (OopMapSet*)NULL);
1572   }
1573   bool is_critical_native = true;
1574   address native_func = critical_entry;
1575   if (native_func == NULL) {
1576     native_func = method->native_function();
1577     is_critical_native = false;
1578   }
1579   assert(native_func != NULL, "must have function");
1580 
1581   // An OopMap for lock (and class if static)
1582   OopMapSet *oop_maps = new OopMapSet();
1583   intptr_t start = (intptr_t)__ pc();
1584 
  // We have received a description of where all the java args are located
1586   // on entry to the wrapper. We need to convert these args to where
1587   // the jni function will expect them. To figure out where they go
1588   // we convert the java signature to a C signature by inserting
1589   // the hidden arguments as arg[0] and possibly arg[1] (static method)
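  //
  // For example, for a static native int m(int), in_sig_bt is { T_INT } and
  // out_sig_bt becomes { T_ADDRESS /* JNIEnv* */, T_OBJECT /* class mirror */, T_INT };
  // a non-static method gets only the leading T_ADDRESS, since its receiver
  // is already the first entry of in_sig_bt.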
1590 
1591   const int total_in_args = method->size_of_parameters();
1592   int total_c_args = total_in_args;
1593   if (!is_critical_native) {
1594     total_c_args += 1;
1595     if (method->is_static()) {
1596       total_c_args++;
1597     }
1598   } else {
1599     for (int i = 0; i < total_in_args; i++) {
1600       if (in_sig_bt[i] == T_ARRAY) {
1601         total_c_args++;
1602       }
1603     }
1604   }
1605 
1606   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1607   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1608   BasicType* in_elem_bt = NULL;
1609 
1610   int argc = 0;
1611   if (!is_critical_native) {
1612     out_sig_bt[argc++] = T_ADDRESS;
1613     if (method->is_static()) {
1614       out_sig_bt[argc++] = T_OBJECT;
1615     }
1616 
1617     for (int i = 0; i < total_in_args ; i++ ) {
1618       out_sig_bt[argc++] = in_sig_bt[i];
1619     }
1620   } else {
1621     in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1622     SignatureStream ss(method->signature());
1623     for (int i = 0; i < total_in_args ; i++ ) {
1624       if (in_sig_bt[i] == T_ARRAY) {
1625         // Arrays are passed as int, elem* pair
1626         out_sig_bt[argc++] = T_INT;
1627         out_sig_bt[argc++] = T_ADDRESS;
1628         ss.skip_array_prefix(1);  // skip one '['
1629         assert(ss.is_primitive(), "primitive type expected");
1630         in_elem_bt[i] = ss.type();
1631       } else {
1632         out_sig_bt[argc++] = in_sig_bt[i];
1633         in_elem_bt[i] = T_VOID;
1634       }
1635       if (in_sig_bt[i] != T_VOID) {
1636         assert(in_sig_bt[i] == ss.type() ||
1637                in_sig_bt[i] == T_ARRAY, "must match");
1638         ss.next();
1639       }
1640     }
1641   }
1642 
1643   // Now figure out where the args must be stored and how much stack space
1644   // they require.
1645   int out_arg_slots;
1646   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1647 
1648   // Compute framesize for the wrapper.  We need to handlize all oops in
1649   // incoming registers
1650 
1651   // Calculate the total number of stack slots we will need.
1652 
1653   // First count the abi requirement plus all of the outgoing args
1654   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1655 
1656   // Now the space for the inbound oop handle area
1657   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1658   if (is_critical_native) {
1659     // Critical natives may have to call out so they need a save area
1660     // for register arguments.
1661     int double_slots = 0;
1662     int single_slots = 0;
1663     for ( int i = 0; i < total_in_args; i++) {
1664       if (in_regs[i].first()->is_Register()) {
1665         const Register reg = in_regs[i].first()->as_Register();
1666         switch (in_sig_bt[i]) {
1667           case T_BOOLEAN:
1668           case T_BYTE:
1669           case T_SHORT:
1670           case T_CHAR:
1671           case T_INT:  single_slots++; break;
1672           case T_ARRAY:  // specific to LP64 (7145024)
1673           case T_LONG: double_slots++; break;
1674           default:  ShouldNotReachHere();
1675         }
1676       } else if (in_regs[i].first()->is_XMMRegister()) {
1677         switch (in_sig_bt[i]) {
1678           case T_FLOAT:  single_slots++; break;
1679           case T_DOUBLE: double_slots++; break;
1680           default:  ShouldNotReachHere();
1681         }
1682       } else if (in_regs[i].first()->is_FloatRegister()) {
1683         ShouldNotReachHere();
1684       }
1685     }
1686     total_save_slots = double_slots * 2 + single_slots;
    // align the save area so 8-byte (double/long) values stored in it stay 8-byte aligned
1688     if (double_slots != 0) {
1689       stack_slots = align_up(stack_slots, 2);
1690     }
1691   }
1692 
1693   int oop_handle_offset = stack_slots;
1694   stack_slots += total_save_slots;
1695 
1696   // Now any space we need for handlizing a klass if static method
1697 
1698   int klass_slot_offset = 0;
1699   int klass_offset = -1;
1700   int lock_slot_offset = 0;
1701   bool is_static = false;
1702 
1703   if (method->is_static()) {
1704     klass_slot_offset = stack_slots;
1705     stack_slots += VMRegImpl::slots_per_word;
1706     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1707     is_static = true;
1708   }
1709 
1710   // Plus a lock if needed
1711 
1712   if (method->is_synchronized()) {
1713     lock_slot_offset = stack_slots;
1714     stack_slots += VMRegImpl::slots_per_word;
1715   }
1716 
1717   // Now a place (+2) to save return values or temp during shuffling
1718   // + 4 for return address (which we own) and saved rbp
1719   stack_slots += 6;
1720 
  // OK, the space we have allocated will look like:
1722   //
1723   //
1724   // FP-> |                     |
1725   //      |---------------------|
1726   //      | 2 slots for moves   |
1727   //      |---------------------|
1728   //      | lock box (if sync)  |
1729   //      |---------------------| <- lock_slot_offset
1730   //      | klass (if static)   |
1731   //      |---------------------| <- klass_slot_offset
1732   //      | oopHandle area      |
1733   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1734   //      | outbound memory     |
1735   //      | based arguments     |
1736   //      |                     |
1737   //      |---------------------|
1738   //      |                     |
1739   // SP-> | out_preserved_slots |
1740   //
1741   //
1742 
1743 
  // Now compute the actual number of stack words we need, rounding to keep
  // the stack properly aligned.
1746   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1747 
1748   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1749 
1750   // First thing make an ic check to see if we should even be here
1751 
1752   // We are free to use all registers as temps without saving them and
1753   // restoring them except rbp. rbp is the only callee save register
1754   // as far as the interpreter and the compiler(s) are concerned.
1755 
1756 
1757   const Register ic_reg = rax;
1758   const Register receiver = j_rarg0;
1759 
1760   Label hit;
1761   Label exception_pending;
1762 
1763   assert_different_registers(ic_reg, receiver, rscratch1);
1764   __ verify_oop(receiver);
1765   __ load_klass(rscratch1, receiver, rscratch2);
1766   __ cmpq(ic_reg, rscratch1);
1767   __ jcc(Assembler::equal, hit);
1768 
1769   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1770 
1771   // Verified entry point must be aligned
1772   __ align(8);
1773 
1774   __ bind(hit);
1775 
1776   int vep_offset = ((intptr_t)__ pc()) - start;
1777 
1778   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1779     Label L_skip_barrier;
1780     Register klass = r10;
1781     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1782     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1783 
1784     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1785 
1786     __ bind(L_skip_barrier);
1787   }
1788 
1789 #ifdef COMPILER1
1790   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1791   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1792     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1793   }
1794 #endif // COMPILER1
1795 
1796   // The instruction at the verified entry point must be 5 bytes or longer
1797   // because it can be patched on the fly by make_non_entrant. The stack bang
1798   // instruction fits that requirement.
1799 
1800   // Generate stack overflow check
1801   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1802 
1803   // Generate a new frame for the wrapper.
1804   __ enter();
1805   // -2 because return address is already present and so is saved rbp
1806   __ subptr(rsp, stack_size - 2*wordSize);
1807 
1808   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1809   bs->nmethod_entry_barrier(masm);
1810 
1811   // Frame is now completed as far as size and linkage.
1812   int frame_complete = ((intptr_t)__ pc()) - start;
1813 
  if (UseRTMLocking) {
    // Abort RTM transaction before calling JNI
    // because the critical section will be large and will be
    // aborted anyway. Also nmethod could be deoptimized.
    __ xabort(0);
  }

#ifdef ASSERT
  {
    Label L;
    __ mov(rax, rsp);
    __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
    __ cmpptr(rax, rsp);
    __ jcc(Assembler::equal, L);
    __ stop("improperly aligned stack");
    __ bind(L);
  }
#endif /* ASSERT */
1832 
1833 
1834   // We use r14 as the oop handle for the receiver/klass
1835   // It is callee save so it survives the call to native
1836 
1837   const Register oop_handle_reg = r14;
1838 
1839   //
1840   // We immediately shuffle the arguments so that any vm call we have to
1841   // make from here on out (sync slow path, jvmti, etc.) we will have
1842   // captured the oops from our caller and have a valid oopMap for
1843   // them.
1844 
1845   // -----------------
1846   // The Grand Shuffle
1847 
  // The Java calling convention is either equal (linux) or denser (win64) than the
  // c calling convention. However, because of the jni_env argument the c calling
  // convention always has at least one more (and two for static) arguments than Java.
  // Therefore if we move the args from java -> c backwards then we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.
1854   //
1855 
1856   // Record esp-based slot for receiver on stack for non-static methods
1857   int receiver_offset = -1;
1858 
1859   // This is a trick. We double the stack slots so we can claim
1860   // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
1862   // sure we can capture all the incoming oop args from the
1863   // caller.
1864   //
1865   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1866 
1867   // Mark location of rbp (someday)
1868   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1869 
1870   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1871   // All inbound args are referenced based on rbp and all outbound args via rsp.
1872 
1873 
1874 #ifdef ASSERT
1875   bool reg_destroyed[RegisterImpl::number_of_registers];
1876   bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1877   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1878     reg_destroyed[r] = false;
1879   }
1880   for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1881     freg_destroyed[f] = false;
1882   }
1883 
1884 #endif /* ASSERT */
1885 
1886   // This may iterate in two different directions depending on the
1887   // kind of native it is.  The reason is that for regular JNI natives
1888   // the incoming and outgoing registers are offset upwards and for
1889   // critical natives they are offset down.
1890   GrowableArray<int> arg_order(2 * total_in_args);
1891 
1892   VMRegPair tmp_vmreg;
1893   tmp_vmreg.set2(rbx->as_VMReg());
1894 
1895   if (!is_critical_native) {
1896     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1897       arg_order.push(i);
1898       arg_order.push(c_arg);
1899     }
1900   } else {
1901     // Compute a valid move order, using tmp_vmreg to break any cycles
1902     ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
1903   }
1904 
1905   int temploc = -1;
1906   for (int ai = 0; ai < arg_order.length(); ai += 2) {
1907     int i = arg_order.at(ai);
1908     int c_arg = arg_order.at(ai + 1);
1909     __ block_comment(err_msg("move %d -> %d", i, c_arg));
1910     if (c_arg == -1) {
1911       assert(is_critical_native, "should only be required for critical natives");
1912       // This arg needs to be moved to a temporary
1913       __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
1914       in_regs[i] = tmp_vmreg;
1915       temploc = i;
1916       continue;
1917     } else if (i == -1) {
1918       assert(is_critical_native, "should only be required for critical natives");
1919       // Read from the temporary location
1920       assert(temploc != -1, "must be valid");
1921       i = temploc;
1922       temploc = -1;
1923     }
1924 #ifdef ASSERT
1925     if (in_regs[i].first()->is_Register()) {
1926       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1927     } else if (in_regs[i].first()->is_XMMRegister()) {
1928       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1929     }
1930     if (out_regs[c_arg].first()->is_Register()) {
1931       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1932     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1933       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1934     }
1935 #endif /* ASSERT */
1936     switch (in_sig_bt[i]) {
1937       case T_ARRAY:
1938         if (is_critical_native) {
1939           unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
1940           c_arg++;
1941 #ifdef ASSERT
1942           if (out_regs[c_arg].first()->is_Register()) {
1943             reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1944           } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1945             freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1946           }
1947 #endif
1948           break;
1949         }
1950       case T_OBJECT:
1951         assert(!is_critical_native, "no oop arguments");
1952         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1953                     ((i == 0) && (!is_static)),
1954                     &receiver_offset);
1955         break;
1956       case T_VOID:
1957         break;
1958 
      case T_FLOAT:
        __ float_move(in_regs[i], out_regs[c_arg]);
        break;
1962 
1963       case T_DOUBLE:
1964         assert( i + 1 < total_in_args &&
1965                 in_sig_bt[i + 1] == T_VOID &&
1966                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1967         __ double_move(in_regs[i], out_regs[c_arg]);
1968         break;
1969 
1970       case T_LONG :
1971         __ long_move(in_regs[i], out_regs[c_arg]);
1972         break;
1973 
1974       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1975 
1976       default:
1977         __ move32_64(in_regs[i], out_regs[c_arg]);
1978     }
1979   }
1980 
1981   int c_arg;
1982 
1983   // Pre-load a static method's oop into r14.  Used both by locking code and
1984   // the normal JNI call code.
1985   if (!is_critical_native) {
1986     // point c_arg at the first arg that is already loaded in case we
1987     // need to spill before we call out
1988     c_arg = total_c_args - total_in_args;
1989 
1990     if (method->is_static()) {
1991 
1992       //  load oop into a register
1993       __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1994 
      // Now handlize the static class mirror; it's known to be not-null.
1996       __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1997       map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1998 
1999       // Now get the handle
2000       __ lea(oop_handle_reg, Address(rsp, klass_offset));
2001       // store the klass handle as second argument
2002       __ movptr(c_rarg1, oop_handle_reg);
2003       // and protect the arg if we must spill
2004       c_arg--;
2005     }
2006   } else {
2007     // For JNI critical methods we need to save all registers in save_args.
2008     c_arg = 0;
2009   }
2010 
  // Change state to native (we save the return address in the thread, since it might not
  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
  // points into the right code segment. It does not have to be the correct return pc.
2014   // We use the same pc/oopMap repeatedly when we call out
2015 
2016   intptr_t the_pc = (intptr_t) __ pc();
2017   oop_maps->add_gc_map(the_pc - start, map);
2018 
2019   __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2020 
2021 
  // We have all of the arguments set up at this point. We must not clobber any of the
  // argument registers from here on (if we had to save/restore them around a call, the
  // oops they contain would not be described by an oopMap).
2024 
2025   {
2026     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2027     // protect the args we've loaded
2028     save_args(masm, total_c_args, c_arg, out_regs);
2029     __ mov_metadata(c_rarg1, method());
2030     __ call_VM_leaf(
2031       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2032       r15_thread, c_rarg1);
2033     restore_args(masm, total_c_args, c_arg, out_regs);
2034   }
2035 
2036   // RedefineClasses() tracing support for obsolete method entry
2037   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2038     // protect the args we've loaded
2039     save_args(masm, total_c_args, c_arg, out_regs);
2040     __ mov_metadata(c_rarg1, method());
2041     __ call_VM_leaf(
2042       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2043       r15_thread, c_rarg1);
2044     restore_args(masm, total_c_args, c_arg, out_regs);
2045   }
2046 
2047   // Lock a synchronized method
2048 
2049   // Register definitions used by locking and unlocking
2050 
2051   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2052   const Register obj_reg  = rbx;  // Will contain the oop
2053   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2054   const Register old_hdr  = r13;  // value of old header at unlock time
2055 
2056   Label slow_path_lock;
2057   Label lock_done;
2058 
2059   if (method->is_synchronized()) {
2060     assert(!is_critical_native, "unhandled");
2061 
2062 
2063     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2064 
2065     // Get the handle (the 2nd argument)
2066     __ mov(oop_handle_reg, c_rarg1);
2067 
2068     // Get address of the box
2069 
2070     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2071 
2072     // Load the oop from the handle
2073     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2074 
2075     if (UseBiasedLocking) {
2076       __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, rscratch2, false, lock_done, &slow_path_lock);
2077     }
2078 
2079     // Load immediate 1 into swap_reg %rax
2080     __ movl(swap_reg, 1);
2081 
2082     // Load (object->mark() | 1) into swap_reg %rax
2083     __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2084 
2085     // Save (object->mark() | 1) into BasicLock's displaced header
2086     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2087 
2088     // src -> dest iff dest == rax else rax <- dest
2089     __ lock();
2090     __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2091     __ jcc(Assembler::equal, lock_done);
2092 
2093     // Hmm should this move to the slow path code area???
2094 
    // Test if the oopMark is an obvious stack pointer, i.e.,
    //  1) (mark & 3) == 0, and
    //  2) rsp <= mark < rsp + os::vm_page_size()
    // These 3 tests can be done by evaluating the following
    // expression: ((mark - rsp) & (3 - os::vm_page_size())),
    // assuming both the stack pointer and the page size have their
    // least significant 2 bits clear.
    // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
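    //
    // For example, with a 4K page, 3 - os::vm_page_size() == -4093 ==
    // 0xfffffffffffff003, so the AND below clears only bits 2..11; the result
    // is zero exactly when mark - rsp is non-negative, smaller than the page
    // size, and has its low two bits clear.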
2103 
2104     __ subptr(swap_reg, rsp);
2105     __ andptr(swap_reg, 3 - os::vm_page_size());
2106 
    // Save the test result; for the recursive case, the result is zero
2108     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2109     __ jcc(Assembler::notEqual, slow_path_lock);
2110 
2111     // Slow path will re-enter here
2112 
2113     __ bind(lock_done);
2114   }
2115 
2116   // Finally just about ready to make the JNI call
2117 
2118   // get JNIEnv* which is first argument to native
2119   if (!is_critical_native) {
2120     __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2121 
2122     // Now set thread in native
2123     __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2124   }
2125 
2126   __ call(RuntimeAddress(native_func));
2127 
2128   // Verify or restore cpu control state after JNI call
2129   __ restore_cpu_control_state_after_jni();
2130 
2131   // Unpack native results.
2132   switch (ret_type) {
2133   case T_BOOLEAN: __ c2bool(rax);            break;
2134   case T_CHAR   : __ movzwl(rax, rax);      break;
2135   case T_BYTE   : __ sign_extend_byte (rax); break;
2136   case T_SHORT  : __ sign_extend_short(rax); break;
2137   case T_INT    : /* nothing to do */        break;
2138   case T_DOUBLE :
2139   case T_FLOAT  :
    // Result is in xmm0; we'll save as needed
2141     break;
2142   case T_ARRAY:                 // Really a handle
2143   case T_OBJECT:                // Really a handle
2144       break; // can't de-handlize until after safepoint check
2145   case T_VOID: break;
2146   case T_LONG: break;
2147   default       : ShouldNotReachHere();
2148   }
2149 
2150   Label after_transition;
2151 
2152   // If this is a critical native, check for a safepoint or suspend request after the call.
2153   // If a safepoint is needed, transition to native, then to native_trans to handle
2154   // safepoints like the native methods that are not critical natives.
2155   if (is_critical_native) {
2156     Label needs_safepoint;
2157     __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */);
2158     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2159     __ jcc(Assembler::equal, after_transition);
2160     __ bind(needs_safepoint);
2161   }
2162 
2163   // Switch thread to "native transition" state before reading the synchronization state.
2164   // This additional state is necessary because reading and testing the synchronization
2165   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2166   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2167   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2168   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
2170   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2171 
2172   // Force this write out before the read below
2173   __ membar(Assembler::Membar_mask_bits(
2174               Assembler::LoadLoad | Assembler::LoadStore |
2175               Assembler::StoreLoad | Assembler::StoreStore));
2176 
2177   // check for safepoint operation in progress and/or pending suspend requests
2178   {
2179     Label Continue;
2180     Label slow_path;
2181 
2182     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2183 
2184     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2185     __ jcc(Assembler::equal, Continue);
2186     __ bind(slow_path);
2187 
    // Don't use call_VM as it will see a possible pending exception and forward it
    // and never return here, preventing us from clearing _last_native_pc down below.
    // Nor can we use call_VM_leaf, as it will check to see if rsi & rdi are
    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
    // by hand.
2193     //
2194     __ vzeroupper();
2195     save_native_result(masm, ret_type, stack_slots);
2196     __ mov(c_rarg0, r15_thread);
2197     __ mov(r12, rsp); // remember sp
2198     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2199     __ andptr(rsp, -16); // align stack as required by ABI
2200     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2201     __ mov(rsp, r12); // restore sp
2202     __ reinit_heapbase();
2203     // Restore any method result value
2204     restore_native_result(masm, ret_type, stack_slots);
2205     __ bind(Continue);
2206   }
2207 
2208   // change thread state
2209   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2210   __ bind(after_transition);
2211 
2212   Label reguard;
2213   Label reguard_done;
2214   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2215   __ jcc(Assembler::equal, reguard);
2216   __ bind(reguard_done);
2217 
2218   // native result if any is live
2219 
2220   // Unlock
2221   Label unlock_done;
2222   Label slow_path_unlock;
2223   if (method->is_synchronized()) {
2224 
2225     // Get locked oop from the handle we passed to jni
2226     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2227 
2228     Label done;
2229 
2230     if (UseBiasedLocking) {
2231       __ biased_locking_exit(obj_reg, old_hdr, done);
2232     }
2233 
2234     // Simple recursive lock?
2235 
2236     __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2237     __ jcc(Assembler::equal, done);
2238 
    // Must save rax if it is live now because cmpxchg must use it
2240     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2241       save_native_result(masm, ret_type, stack_slots);
2242     }
2243 
2244 
2245     // get address of the stack lock
2246     __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2247     //  get old displaced header
2248     __ movptr(old_hdr, Address(rax, 0));
2249 
2250     // Atomic swap old header if oop still contains the stack lock
2251     __ lock();
2252     __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2253     __ jcc(Assembler::notEqual, slow_path_unlock);
2254 
2255     // slow path re-enters here
2256     __ bind(unlock_done);
2257     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2258       restore_native_result(masm, ret_type, stack_slots);
2259     }
2260 
2261     __ bind(done);
2262 
2263   }
2264   {
2265     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2266     save_native_result(masm, ret_type, stack_slots);
2267     __ mov_metadata(c_rarg1, method());
2268     __ call_VM_leaf(
2269          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2270          r15_thread, c_rarg1);
2271     restore_native_result(masm, ret_type, stack_slots);
2272   }
2273 
2274   __ reset_last_Java_frame(false);
2275 
2276   // Unbox oop result, e.g. JNIHandles::resolve value.
2277   if (is_reference_type(ret_type)) {
2278     __ resolve_jobject(rax /* value */,
2279                        r15_thread /* thread */,
2280                        rcx /* tmp */);
2281   }
2282 
2283   if (CheckJNICalls) {
2284     // clear_pending_jni_exception_check
2285     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2286   }
2287 
2288   if (!is_critical_native) {
2289     // reset handle block
2290     __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2291     __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2292   }
2293 
2294   // pop our frame
2295 
2296   __ leave();
2297 
2298   if (!is_critical_native) {
2299     // Any exception pending?
2300     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2301     __ jcc(Assembler::notEqual, exception_pending);
2302   }
2303 
2304   // Return
2305 
2306   __ ret(0);
2307 
2308   // Unexpected paths are out of line and go here
2309 
2310   if (!is_critical_native) {
2311     // forward the exception
2312     __ bind(exception_pending);
2313 
2314     // and forward the exception
2315     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2316   }
2317 
2318   // Slow path locking & unlocking
2319   if (method->is_synchronized()) {
2320 
2321     // BEGIN Slow path lock
2322     __ bind(slow_path_lock);
2323 
    // We have last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM.
2325     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2326 
2327     // protect the args we've loaded
2328     save_args(masm, total_c_args, c_arg, out_regs);
2329 
2330     __ mov(c_rarg0, obj_reg);
2331     __ mov(c_rarg1, lock_reg);
2332     __ mov(c_rarg2, r15_thread);
2333 
2334     // Not a leaf but we have last_Java_frame setup as we want
2335     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2336     restore_args(masm, total_c_args, c_arg, out_regs);
2337 
2338 #ifdef ASSERT
2339     { Label L;
2340     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2341     __ jcc(Assembler::equal, L);
2342     __ stop("no pending exception allowed on exit from monitorenter");
2343     __ bind(L);
2344     }
2345 #endif
2346     __ jmp(lock_done);
2347 
2348     // END Slow path lock
2349 
2350     // BEGIN Slow path unlock
2351     __ bind(slow_path_unlock);
2352 
2353     // If we haven't already saved the native result we must save it now as xmm registers
2354     // are still exposed.
2355     __ vzeroupper();
2356     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2357       save_native_result(masm, ret_type, stack_slots);
2358     }
2359 
2360     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2361 
2362     __ mov(c_rarg0, obj_reg);
2363     __ mov(c_rarg2, r15_thread);
2364     __ mov(r12, rsp); // remember sp
2365     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2366     __ andptr(rsp, -16); // align stack as required by ABI
2367 
2368     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2369     // NOTE that obj_reg == rbx currently
2370     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2371     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2372 
2373     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2374     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2375     __ mov(rsp, r12); // restore sp
2376     __ reinit_heapbase();
2377 #ifdef ASSERT
2378     {
2379       Label L;
2380       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2381       __ jcc(Assembler::equal, L);
2382       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2383       __ bind(L);
2384     }
2385 #endif /* ASSERT */
2386 
2387     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2388 
2389     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2390       restore_native_result(masm, ret_type, stack_slots);
2391     }
2392     __ jmp(unlock_done);
2393 
2394     // END Slow path unlock
2395 
2396   } // synchronized
2397 
2398   // SLOW PATH Reguard the stack if needed
2399 
2400   __ bind(reguard);
2401   __ vzeroupper();
2402   save_native_result(masm, ret_type, stack_slots);
2403   __ mov(r12, rsp); // remember sp
2404   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2405   __ andptr(rsp, -16); // align stack as required by ABI
2406   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2407   __ mov(rsp, r12); // restore sp
2408   __ reinit_heapbase();
2409   restore_native_result(masm, ret_type, stack_slots);
2410   // and continue
2411   __ jmp(reguard_done);
2412 
2413 
2414 
2415   __ flush();
2416 
2417   nmethod *nm = nmethod::new_native_nmethod(method,
2418                                             compile_id,
2419                                             masm->code(),
2420                                             vep_offset,
2421                                             frame_complete,
2422                                             stack_slots / VMRegImpl::slots_per_word,
2423                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2424                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2425                                             oop_maps);
2426 
2427   return nm;
2428 }
2429 
// This function returns the adjustment size (in number of words) to a c2i adapter
// activation for use during deoptimization.
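// For example, a callee with 3 parameters and 7 locals yields an adjustment of
// (7 - 3) * Interpreter::stackElementWords words.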
2432 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2433   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2434 }
2435 
2436 
2437 uint SharedRuntime::out_preserve_stack_slots() {
2438   return 0;
2439 }
2440 
2441 
2442 // Number of stack slots between incoming argument block and the start of
2443 // a new frame.  The PROLOG must add this many slots to the stack.  The
2444 // EPILOG must remove this many slots.  amd64 needs two slots for
2445 // return address.
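// (Presumably the 4 slots cover the 8-byte return address plus the 8-byte
// saved rbp, with VerifyStackAtCalls reserving two more 32-bit slots.)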
2446 uint SharedRuntime::in_preserve_stack_slots() {
2447   return 4 + 2 * VerifyStackAtCalls;
2448 }
2449 
2450 //------------------------------generate_deopt_blob----------------------------
2451 void SharedRuntime::generate_deopt_blob() {
2452   // Allocate space for the code
2453   ResourceMark rm;
2454   // Setup code generation tools
2455   int pad = 0;
2456   if (UseAVX > 2) {
2457     pad += 1024;
2458   }
2459 #if INCLUDE_JVMCI
2460   if (EnableJVMCI) {
2461     pad += 512; // Increase the buffer size when compiling for JVMCI
2462   }
2463 #endif
2464   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2465   MacroAssembler* masm = new MacroAssembler(&buffer);
2466   int frame_size_in_words;
2467   OopMap* map = NULL;
2468   OopMapSet *oop_maps = new OopMapSet();
2469 
2470   // -------------
2471   // This code enters when returning to a de-optimized nmethod.  A return
  // address has been pushed on the stack, and return values are in
2473   // registers.
2474   // If we are doing a normal deopt then we were called from the patched
2475   // nmethod from the point we returned to the nmethod. So the return
2476   // address on the stack is wrong by NativeCall::instruction_size
2477   // We will adjust the value so it looks like we have the original return
2478   // address on the stack (like when we eagerly deoptimized).
2479   // In the case of an exception pending when deoptimizing, we enter
2480   // with a return address on the stack that points after the call we patched
2481   // into the exception handler. We have the following register state from,
2482   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2483   //    rax: exception oop
2484   //    rbx: exception handler
2485   //    rdx: throwing pc
2486   // So in this case we simply jam rdx into the useless return address and
2487   // the stack looks just like we want.
2488   //
2489   // At this point we need to de-opt.  We save the argument return
2490   // registers.  We call the first C routine, fetch_unroll_info().  This
2491   // routine captures the return values and returns a structure which
2492   // describes the current frame size and the sizes of all replacement frames.
2493   // The current frame is compiled code and may contain many inlined
2494   // functions, each with their own JVM state.  We pop the current frame, then
2495   // push all the new frames.  Then we call the C routine unpack_frames() to
2496   // populate these frames.  Finally unpack_frames() returns us the new target
2497   // address.  Notice that callee-save registers are BLOWN here; they have
2498   // already been captured in the vframeArray at the time the return PC was
2499   // patched.
2500   address start = __ pc();
2501   Label cont;
2502 
2503   // Prolog for non exception case!
2504 
2505   // Save everything in sight.
2506   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2507 
2508   // Normal deoptimization.  Save exec mode for unpack_frames.
2509   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2510   __ jmp(cont);
2511 
2512   int reexecute_offset = __ pc() - start;
2513 #if INCLUDE_JVMCI && !defined(COMPILER1)
2514   if (EnableJVMCI && UseJVMCICompiler) {
2515     // JVMCI does not use this kind of deoptimization
2516     __ should_not_reach_here();
2517   }
2518 #endif
2519 
2520   // Reexecute case
  // the return address is the pc that describes what bci to re-execute at
2522 
2523   // No need to update map as each call to save_live_registers will produce identical oopmap
2524   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2525 
2526   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2527   __ jmp(cont);
2528 
2529 #if INCLUDE_JVMCI
2530   Label after_fetch_unroll_info_call;
2531   int implicit_exception_uncommon_trap_offset = 0;
2532   int uncommon_trap_offset = 0;
2533 
2534   if (EnableJVMCI) {
2535     implicit_exception_uncommon_trap_offset = __ pc() - start;
2536 
2537     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2538     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2539 
2540     uncommon_trap_offset = __ pc() - start;
2541 
2542     // Save everything in sight.
2543     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2544     // fetch_unroll_info needs to call last_java_frame()
2545     __ set_last_Java_frame(noreg, noreg, NULL);
2546 
2547     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2548     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2549 
2550     __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2551     __ mov(c_rarg0, r15_thread);
2552     __ movl(c_rarg2, r14); // exec mode
2553     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2554     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2555 
2556     __ reset_last_Java_frame(false);
2557 
2558     __ jmp(after_fetch_unroll_info_call);
2559   } // EnableJVMCI
2560 #endif // INCLUDE_JVMCI
2561 
2562   int exception_offset = __ pc() - start;
2563 
2564   // Prolog for exception case
2565 
  // all registers are dead at this entry point, except for rax and
  // rdx, which contain the exception oop and exception pc
2568   // respectively.  Set them in TLS and fall thru to the
2569   // unpack_with_exception_in_tls entry point.
2570 
2571   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2572   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2573 
2574   int exception_in_tls_offset = __ pc() - start;
2575 
2576   // new implementation because exception oop is now passed in JavaThread
2577 
2578   // Prolog for exception case
2579   // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2581   // tos: stack at point of call to method that threw the exception (i.e. only
2582   // args are on the stack, no return address)
2583 
2584   // make room on stack for the return address
2585   // It will be patched later with the throwing pc. The correct value is not
2586   // available now because loading it from memory would destroy registers.
2587   __ push(0);
2588 
2589   // Save everything in sight.
2590   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2591 
2592   // Now it is safe to overwrite any register
2593 
2594   // Deopt during an exception.  Save exec mode for unpack_frames.
2595   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2596 
2597   // load throwing pc from JavaThread and patch it as the return address
2598   // of the current frame. Then clear the field in JavaThread
2599 
2600   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2601   __ movptr(Address(rbp, wordSize), rdx);
2602   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2603 
2604 #ifdef ASSERT
2605   // verify that there is really an exception oop in JavaThread
2606   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2607   __ verify_oop(rax);
2608 
2609   // verify that there is no pending exception
2610   Label no_pending_exception;
2611   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2612   __ testptr(rax, rax);
2613   __ jcc(Assembler::zero, no_pending_exception);
2614   __ stop("must not have pending exception here");
2615   __ bind(no_pending_exception);
2616 #endif
2617 
2618   __ bind(cont);
2619 
2620   // Call C code.  Need thread and this frame, but NOT official VM entry
2621   // crud.  We cannot block on this call, no GC can happen.
2622   //
2623   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2624 
2625   // fetch_unroll_info needs to call last_java_frame().
2626 
2627   __ set_last_Java_frame(noreg, noreg, NULL);
2628 #ifdef ASSERT
2629   { Label L;
2630     __ cmpptr(Address(r15_thread,
2631                     JavaThread::last_Java_fp_offset()),
2632             (int32_t)0);
2633     __ jcc(Assembler::equal, L);
2634     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2635     __ bind(L);
2636   }
2637 #endif // ASSERT
2638   __ mov(c_rarg0, r15_thread);
2639   __ movl(c_rarg1, r14); // exec_mode
2640   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2641 
2642   // Need to have an oopmap that tells fetch_unroll_info where to
2643   // find any register it might need.
2644   oop_maps->add_gc_map(__ pc() - start, map);
2645 
2646   __ reset_last_Java_frame(false);
2647 
2648 #if INCLUDE_JVMCI
2649   if (EnableJVMCI) {
2650     __ bind(after_fetch_unroll_info_call);
2651   }
2652 #endif
2653 
2654   // Load UnrollBlock* into rdi
2655   __ mov(rdi, rax);
2656 
2657   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
  Label noException;
2659   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2660   __ jcc(Assembler::notEqual, noException);
2661   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2662   // QQQ this load is useless: exception_pc was cleared to NULL above
2663   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2664   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2665   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2666 
2667   __ verify_oop(rax);
2668 
2669   // Overwrite the result registers with the exception results.
2670   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2671   // I think this is useless
2672   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2673 
2674   __ bind(noException);
2675 
2676   // Only register save data is on the stack.
2677   // Now restore the result registers.  Everything else is either dead
2678   // or captured in the vframeArray.
2679   RegisterSaver::restore_result_registers(masm);
2680 
2681   // All of the register save area has been popped off the stack. Only the
2682   // return address remains.
2683 
2684   // Pop all the frames we must move/replace.
2685   //
2686   // Frame picture (youngest to oldest)
2687   // 1: self-frame (no frame link)
2688   // 2: deopting frame  (no frame link)
2689   // 3: caller of deopting frame (could be compiled/interpreted).
2690   //
2691   // Note: by leaving the return address of the self-frame on the stack
2692   // and using the size of frame 2 to adjust the stack,
2693   // the return address to frame 3 will still be on the stack when we are done.
2694 
2695   // Pop deoptimized frame
2696   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2697   __ addptr(rsp, rcx);
2698 
2699   // rsp should be pointing at the return address to the caller (3)
2700 
2701   // Pick up the initial fp we should save
2702   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2703   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2704 
2705 #ifdef ASSERT
2706   // Compilers generate code that bangs the stack by as much as the
2707   // interpreter would need. So this stack banging should never
2708   // trigger a fault. Verify that it does not on non-product builds.
2709   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2710   __ bang_stack_size(rbx, rcx);
2711 #endif
2712 
2713   // Load address of array of frame pcs into rcx
2714   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2715 
2716   // Trash the old pc
2717   __ addptr(rsp, wordSize);
2718 
2719   // Load address of array of frame sizes into rsi
2720   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2721 
2722   // Load counter into rdx
2723   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2724 
2725   // Now adjust the caller's stack to make up for the extra locals
2726   // but record the original sp so that we can save it in the skeletal interpreter
2727   // frame and the stack walking of interpreter_sender will get the unextended sp
2728   // value and not the "real" sp value.
2729 
2730   const Register sender_sp = r8;
2731 
2732   __ mov(sender_sp, rsp);
2733   __ movl(rbx, Address(rdi,
2734                        Deoptimization::UnrollBlock::
2735                        caller_adjustment_offset_in_bytes()));
2736   __ subptr(rsp, rbx);
2737 
2738   // Push interpreter frames in a loop
2739   Label loop;
2740   __ bind(loop);
2741   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2742   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2743   __ pushptr(Address(rcx, 0));          // Save return address
2744   __ enter();                           // Save old & set new ebp
2745   __ subptr(rsp, rbx);                  // Prolog
2746   // This value is corrected by layout_activation_impl
2747   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2748   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2749   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2750   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2751   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2752   __ decrementl(rdx);                   // Decrement counter
2753   __ jcc(Assembler::notZero, loop);
2754   __ pushptr(Address(rcx, 0));          // Save final return address
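
       // At this point (a sketch, not an authoritative layout): rsp points at the
       // return address just pushed for the self-frame (taken from the frame_pcs
       // array); above it sit the skeletal interpreter frames built by the loop,
       // one per entry in the frame_sizes/frame_pcs arrays, each with its saved
       // rbp and return pc; and above those the caller of the deoptimized frame
       // (3). The skeletal frames are filled in by Deoptimization::unpack_frames
       // called below.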
2755 
2756   // Re-push self-frame
2757   __ enter();                           // Save old & set new ebp
2758 
2759   // Allocate a full sized register save area.
2760   // Return address and rbp are in place, so we allocate two less words.
2761   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2762 
2763   // Restore frame locals after moving the frame
2764   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2765   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2766 
2767   // Call C code.  Need thread but NOT official VM entry
2768   // crud.  We cannot block on this call, no GC can happen.  Call should
2769   // restore return values to their stack-slots with the new SP.
2770   //
2771   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2772 
2773   // Use rbp because the frames look interpreted now
2774   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2775   // Don't need the precise return PC here, just precise enough to point into this code blob.
2776   address the_pc = __ pc();
2777   __ set_last_Java_frame(noreg, rbp, the_pc);
2778 
2779   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2780   __ mov(c_rarg0, r15_thread);
2781   __ movl(c_rarg1, r14); // second arg: exec_mode
2782   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2783   // Revert SP alignment after call since we're going to do some SP relative addressing below
2784   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2785 
2786   // Set an oopmap for the call site
2787   // Use the same PC we used for the last java frame
2788   oop_maps->add_gc_map(the_pc - start,
2789                        new OopMap( frame_size_in_words, 0 ));
2790 
2791   // Clear fp AND pc
2792   __ reset_last_Java_frame(true);
2793 
2794   // Collect return values
2795   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2796   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2797   // I think this is useless (throwing pc?)
2798   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2799 
2800   // Pop self-frame.
2801   __ leave();                           // Epilog
2802 
2803   // Jump to interpreter
2804   __ ret(0);
2805 
2806   // Make sure all code is generated
2807   masm->flush();
2808 
2809   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2810   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2811 #if INCLUDE_JVMCI
2812   if (EnableJVMCI) {
2813     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2814     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2815   }
2816 #endif
2817 }
2818 
2819 #ifdef COMPILER2
2820 //------------------------------generate_uncommon_trap_blob--------------------
2821 void SharedRuntime::generate_uncommon_trap_blob() {
2822   // Allocate space for the code
2823   ResourceMark rm;
2824   // Setup code generation tools
2825   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2826   MacroAssembler* masm = new MacroAssembler(&buffer);
2827 
2828   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2829 
2830   address start = __ pc();
2831 
2832   if (UseRTMLocking) {
2833     // Abort RTM transaction before possible nmethod deoptimization.
2834     __ xabort(0);
2835   }
2836 
2837   // Push self-frame.  We get here with a return address on the
2838   // stack, so rsp is 8-byte aligned until we allocate our frame.
2839   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
2840 
2841   // No callee saved registers. rbp is assumed implicitly saved
2842   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2843 
2844   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2845   // runtime expects it.
2846   __ movl(c_rarg1, j_rarg0);
2847 
2848   __ set_last_Java_frame(noreg, noreg, NULL);
2849 
2850   // Call C code.  Need thread but NOT official VM entry
2851   // crud.  We cannot block on this call, no GC can happen.  Call should
2852   // capture callee-saved registers as well as return values.
2853   // Thread is in rdi already.
2854   //
2855   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2856 
2857   __ mov(c_rarg0, r15_thread);
2858   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2859   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2860 
2861   // Set an oopmap for the call site
2862   OopMapSet* oop_maps = new OopMapSet();
2863   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2864 
2865   // location of rbp is known implicitly by the frame sender code
2866 
2867   oop_maps->add_gc_map(__ pc() - start, map);
2868 
2869   __ reset_last_Java_frame(false);
2870 
2871   // Load UnrollBlock* into rdi
2872   __ mov(rdi, rax);
2873 
2874 #ifdef ASSERT
2875   { Label L;
2876     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2877             (int32_t)Deoptimization::Unpack_uncommon_trap);
2878     __ jcc(Assembler::equal, L);
2879     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2880     __ bind(L);
2881   }
2882 #endif
2883 
2884   // Pop all the frames we must move/replace.
2885   //
2886   // Frame picture (youngest to oldest)
2887   // 1: self-frame (no frame link)
2888   // 2: deopting frame  (no frame link)
2889   // 3: caller of deopting frame (could be compiled/interpreted).
2890 
2891   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2892   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2893 
2894   // Pop deoptimized frame (int)
2895   __ movl(rcx, Address(rdi,
2896                        Deoptimization::UnrollBlock::
2897                        size_of_deoptimized_frame_offset_in_bytes()));
2898   __ addptr(rsp, rcx);
2899 
2900   // rsp should be pointing at the return address to the caller (3)
2901 
2902   // Pick up the initial fp we should save
2903   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2904   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2905 
2906 #ifdef ASSERT
2907   // Compilers generate code that bangs the stack by as much as the
2908   // interpreter would need. So this stack banging should never
2909   // trigger a fault. Verify that it does not on non-product builds.
2910   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2911   __ bang_stack_size(rbx, rcx);
2912 #endif
2913 
2914   // Load address of array of frame pcs into rcx (address*)
2915   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2916 
2917   // Trash the return pc
2918   __ addptr(rsp, wordSize);
2919 
2920   // Load address of array of frame sizes into rsi (intptr_t*)
2921   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2922 
2923   // Counter
2924   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
2925 
2926   // Now adjust the caller's stack to make up for the extra locals but
2927   // record the original sp so that we can save it in the skeletal
2928   // interpreter frame and the stack walking of interpreter_sender
2929   // will get the unextended sp value and not the "real" sp value.
2930 
2931   const Register sender_sp = r8;
2932 
2933   __ mov(sender_sp, rsp);
2934   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
2935   __ subptr(rsp, rbx);
2936 
2937   // Push interpreter frames in a loop
2938   Label loop;
2939   __ bind(loop);
2940   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2941   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2942   __ pushptr(Address(rcx, 0));     // Save return address
2943   __ enter();                      // Save old & set new rbp
2944   __ subptr(rsp, rbx);             // Prolog
2945   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2946             sender_sp);            // Make it walkable
2947   // This value is corrected by layout_activation_impl
2948   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2949   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
2950   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
2951   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
2952   __ decrementl(rdx);              // Decrement counter
2953   __ jcc(Assembler::notZero, loop);
2954   __ pushptr(Address(rcx, 0));     // Save final return address
2955 
2956   // Re-push self-frame
2957   __ enter();                 // Save old & set new rbp
2958   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2959                               // Prolog
2960 
2961   // Use rbp because the frames look interpreted now
2962   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2963   // Don't need the precise return PC here, just precise enough to point into this code blob.
2964   address the_pc = __ pc();
2965   __ set_last_Java_frame(noreg, rbp, the_pc);
2966 
2967   // Call C code.  Need thread but NOT official VM entry
2968   // crud.  We cannot block on this call, no GC can happen.  Call should
2969   // restore return values to their stack-slots with the new SP.
2970   // Thread is in rdi already.
2971   //
2972   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2973 
2974   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2975   __ mov(c_rarg0, r15_thread);
2976   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2977   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2978 
2979   // Set an oopmap for the call site
2980   // Use the same PC we used for the last java frame
2981   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2982 
2983   // Clear fp AND pc
2984   __ reset_last_Java_frame(true);
2985 
2986   // Pop self-frame.
2987   __ leave();                 // Epilog
2988 
2989   // Jump to interpreter
2990   __ ret(0);
2991 
2992   // Make sure all code is generated
2993   masm->flush();
2994 
2995   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
2996                                                  SimpleRuntimeFrame::framesize >> 1);
2997 }
2998 #endif // COMPILER2
2999 
3000 //------------------------------generate_handler_blob------
3001 //
3002 // Generate a special Compile2Runtime blob that saves all registers
3003 // and sets up the oopmap.
3004 //
3005 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3006   assert(StubRoutines::forward_exception_entry() != NULL,
3007          "must be generated before");
3008 
3009   ResourceMark rm;
3010   OopMapSet *oop_maps = new OopMapSet();
3011   OopMap* map;
3012 
3013   // Allocate space for the code.  Setup code generation tools.
3014   CodeBuffer buffer("handler_blob", 2048, 1024);
3015   MacroAssembler* masm = new MacroAssembler(&buffer);
3016 
3017   address start   = __ pc();
3018   address call_pc = NULL;
3019   int frame_size_in_words;
3020   bool cause_return = (poll_type == POLL_AT_RETURN);
3021   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3022 
3023   if (UseRTMLocking) {
3024     // Abort RTM transaction before calling runtime
3025     // because critical section will be large and will be
3026     // aborted anyway. Also nmethod could be deoptimized.
3027     __ xabort(0);
3028   }
3029 
3030   // Make room for return address (or push it again)
3031   if (!cause_return) {
3032     __ push(rbx);
3033   }
3034 
3035   // Save registers, fpu state, and flags
3036   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3037 
3038   // The following is basically a call_VM.  However, we need the precise
3039   // address of the call in order to generate an oopmap. Hence, we do all the
3040   // work ourselves.
3041 
3042   __ set_last_Java_frame(noreg, noreg, NULL);
3043 
3044   // The return address must always be correct so that the frame constructor never
3045   // sees an invalid pc.
3046 
3047   if (!cause_return) {
3048     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3049     // Additionally, rbx is a callee saved register and we can look at it later to determine
3050     // if someone changed the return address for us!
3051     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3052     __ movptr(Address(rbp, wordSize), rbx);
3053   }
3054 
3055   // Do the call
3056   __ mov(c_rarg0, r15_thread);
3057   __ call(RuntimeAddress(call_ptr));
3058 
3059   // Set an oopmap for the call site.  This oopmap will map all
3060   // oop-registers and debug-info registers as callee-saved.  This
3061   // will allow deoptimization at this safepoint to find all possible
3062   // debug-info recordings, as well as let GC find all oops.
3063 
3064   oop_maps->add_gc_map( __ pc() - start, map);
3065 
3066   Label noException;
3067 
3068   __ reset_last_Java_frame(false);
3069 
3070   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3071   __ jcc(Assembler::equal, noException);
3072 
3073   // Exception pending
3074 
3075   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3076 
3077   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3078 
3079   // No exception case
3080   __ bind(noException);
3081 
3082   Label no_adjust;
3083 #ifdef ASSERT
3084   Label bail;
3085 #endif
3086   if (!cause_return) {
3087     Label no_prefix, not_special;
3088 
3089     // If our stashed return pc was modified by the runtime we avoid touching it
3090     __ cmpptr(rbx, Address(rbp, wordSize));
3091     __ jccb(Assembler::notEqual, no_adjust);
3092 
3093     // Skip over the poll instruction.
3094     // See NativeInstruction::is_safepoint_poll()
3095     // Possible encodings:
3096     //      85 00       test   %eax,(%rax)
3097     //      85 01       test   %eax,(%rcx)
3098     //      85 02       test   %eax,(%rdx)
3099     //      85 03       test   %eax,(%rbx)
3100     //      85 06       test   %eax,(%rsi)
3101     //      85 07       test   %eax,(%rdi)
3102     //
3103     //   41 85 00       test   %eax,(%r8)
3104     //   41 85 01       test   %eax,(%r9)
3105     //   41 85 02       test   %eax,(%r10)
3106     //   41 85 03       test   %eax,(%r11)
3107     //   41 85 06       test   %eax,(%r14)
3108     //   41 85 07       test   %eax,(%r15)
3109     //
3110     //      85 04 24    test   %eax,(%rsp)
3111     //   41 85 04 24    test   %eax,(%r12)
3112     //      85 45 00    test   %eax,0x0(%rbp)
3113     //   41 85 45 00    test   %eax,0x0(%r13)
3114 
3115     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3116     __ jcc(Assembler::notEqual, no_prefix);
3117     __ addptr(rbx, 1);
3118     __ bind(no_prefix);
3119 #ifdef ASSERT
3120     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3121 #endif
3122     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3123     // r12/rsp 0x04
3124     // r13/rbp 0x05
3125     __ movzbq(rcx, Address(rbx, 1));
3126     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3127     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3128     __ cmpptr(rcx, 1);
3129     __ jcc(Assembler::above, not_special);
3130     __ addptr(rbx, 1);
3131     __ bind(not_special);
3132 #ifdef ASSERT
3133     // Verify the correct encoding of the poll we're about to skip.
3134     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3135     __ jcc(Assembler::notEqual, bail);
3136     // Mask out the modrm bits
3137     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3138     // rax encodes to 0, so if the bits are nonzero it's incorrect
3139     __ jcc(Assembler::notZero, bail);
3140 #endif
3141     // Adjust return pc forward to step over the safepoint poll instruction
3142     __ addptr(rbx, 2);
3143     __ movptr(Address(rbp, wordSize), rbx);
3144   }
3145 
3146   __ bind(no_adjust);
3147   // Normal exit, restore registers and exit.
3148   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3149   __ ret(0);
3150 
3151 #ifdef ASSERT
3152   __ bind(bail);
3153   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3154 #endif
3155 
3156   // Make sure all code is generated
3157   masm->flush();
3158 
3159   // Fill-out other meta info
3160   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3161 }
3162 
3163 //
3164 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3165 //
3166 // Generate a stub that calls into the VM to find out the proper destination
3167 // of a Java call. All the argument registers are live at this point,
3168 // but since this is generic code we don't know what they are, and the caller
3169 // must do any GC of the args.
3170 //
3171 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3172   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3173 
3174   // allocate space for the code
3175   ResourceMark rm;
3176 
3177   CodeBuffer buffer(name, 1000, 512);
3178   MacroAssembler* masm                = new MacroAssembler(&buffer);
3179 
3180   int frame_size_in_words;
3181 
3182   OopMapSet *oop_maps = new OopMapSet();
3183   OopMap* map = NULL;
3184 
3185   int start = __ offset();
3186 
3187   // No need to save vector registers since they are caller-saved anyway.
3188   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3189 
3190   int frame_complete = __ offset();
3191 
3192   __ set_last_Java_frame(noreg, noreg, NULL);
3193 
3194   __ mov(c_rarg0, r15_thread);
3195 
3196   __ call(RuntimeAddress(destination));
3197 
3198 
3199   // Set an oopmap for the call site.
3200   // We need this not only for callee-saved registers, but also for volatile
3201   // registers that the compiler might be keeping live across a safepoint.
3202 
3203   oop_maps->add_gc_map( __ offset() - start, map);
3204 
3205   // rax contains the address we are going to jump to assuming no exception got installed
3206 
3207   // clear last_Java_sp
3208   __ reset_last_Java_frame(false);
3209   // check for pending exceptions
3210   Label pending;
3211   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3212   __ jcc(Assembler::notEqual, pending);
3213 
3214   // get the returned Method*
3215   __ get_vm_result_2(rbx, r15_thread);
3216   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3217 
3218   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3219 
3220   RegisterSaver::restore_live_registers(masm);
3221 
3222   // We are back to the original state on entry and ready to go.
3223 
3224   __ jmp(rax);
3225 
3226   // Pending exception after the safepoint
3227 
3228   __ bind(pending);
3229 
3230   RegisterSaver::restore_live_registers(masm);
3231 
3232   // exception pending => remove activation and forward to exception handler
3233 
3234   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3235 
3236   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3237   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3238 
3239   // -------------
3240   // make sure all code is generated
3241   masm->flush();
3242 
3243   // return the blob
3244   // the frame size passed to new_runtime_stub is in words
3245   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3246 }
3247 
3248 #ifdef COMPILER2
3249 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3250 
3251 class NativeInvokerGenerator : public StubCodeGenerator {
3252   address _call_target;
3253   int _shadow_space_bytes;
3254 
3255   const GrowableArray<VMReg>& _input_registers;
3256   const GrowableArray<VMReg>& _output_registers;
3257 
3258   int _frame_complete;
3259   int _framesize;
3260   OopMapSet* _oop_maps;
3261 public:
3262   NativeInvokerGenerator(CodeBuffer* buffer,
3263                          address call_target,
3264                          int shadow_space_bytes,
3265                          const GrowableArray<VMReg>& input_registers,
3266                          const GrowableArray<VMReg>& output_registers)
3267    : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3268      _call_target(call_target),
3269      _shadow_space_bytes(shadow_space_bytes),
3270      _input_registers(input_registers),
3271      _output_registers(output_registers),
3272      _frame_complete(0),
3273      _framesize(0),
3274      _oop_maps(NULL) {
3275     assert(_output_registers.length() <= 1
3276            || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3277 
3278   }
3279 
3280   void generate();
3281 
3282   int spill_size_in_bytes() const {
3283     if (_output_registers.length() == 0) {
3284       return 0;
3285     }
3286     VMReg reg = _output_registers.at(0);
3287     assert(reg->is_reg(), "must be a register");
3288     if (reg->is_Register()) {
3289       return 8;
3290     } else if (reg->is_XMMRegister()) {
3291       if (UseAVX >= 3) {
3292         return 64;
3293       } else if (UseAVX >= 1) {
3294         return 32;
3295       } else {
3296         return 16;
3297       }
3298     } else {
3299       ShouldNotReachHere();
3300     }
3301     return 0;
3302   }
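
       // A note on the sizes above (illustrative): 64, 32 and 16 bytes match the
       // widest register state that may carry the return value under AVX-512
       // (ZMM), AVX (YMM) and plain SSE (XMM) respectively; spill_out_registers()
       // and fill_out_registers() below use the matching move instructions.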
3303 
3304   void spill_out_registers() {
3305     if (_output_registers.length() == 0) {
3306       return;
3307     }
3308     VMReg reg = _output_registers.at(0);
3309     assert(reg->is_reg(), "must be a register");
3310     MacroAssembler* masm = _masm;
3311     if (reg->is_Register()) {
3312       __ movptr(Address(rsp, 0), reg->as_Register());
3313     } else if (reg->is_XMMRegister()) {
3314       if (UseAVX >= 3) {
3315         __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3316       } else if (UseAVX >= 1) {
3317         __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3318       } else {
3319         __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3320       }
3321     } else {
3322       ShouldNotReachHere();
3323     }
3324   }
3325 
3326   void fill_out_registers() {
3327     if (_output_registers.length() == 0) {
3328       return;
3329     }
3330     VMReg reg = _output_registers.at(0);
3331     assert(reg->is_reg(), "must be a register");
3332     MacroAssembler* masm = _masm;
3333     if (reg->is_Register()) {
3334       __ movptr(reg->as_Register(), Address(rsp, 0));
3335     } else if (reg->is_XMMRegister()) {
3336       if (UseAVX >= 3) {
3337         __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3338       } else if (UseAVX >= 1) {
3339         __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3340       } else {
3341         __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3342       }
3343     } else {
3344       ShouldNotReachHere();
3345     }
3346   }
3347 
3348   int frame_complete() const {
3349     return _frame_complete;
3350   }
3351 
3352   int framesize() const {
3353     return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3354   }
3355 
3356   OopMapSet* oop_maps() const {
3357     return _oop_maps;
3358   }
3359 
3360 private:
3361 #ifdef ASSERT
3362 bool target_uses_register(VMReg reg) {
3363   return _input_registers.contains(reg) || _output_registers.contains(reg);
3364 }
3365 #endif
3366 };
3367 
3368 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3369                                                 int shadow_space_bytes,
3370                                                 const GrowableArray<VMReg>& input_registers,
3371                                                 const GrowableArray<VMReg>& output_registers) {
3372   int locs_size  = 64;
3373   CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3374   NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3375   g.generate();
3376   code.log_section_sizes("nep_invoker_blob");
3377 
3378   RuntimeStub* stub =
3379     RuntimeStub::new_runtime_stub("nep_invoker_blob",
3380                                   &code,
3381                                   g.frame_complete(),
3382                                   g.framesize(),
3383                                   g.oop_maps(), false);
3384   return stub;
3385 }
3386 
3387 void NativeInvokerGenerator::generate() {
3388   assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3389 
3390   enum layout {
3391     rbp_off,
3392     rbp_off2,
3393     return_off,
3394     return_off2,
3395     framesize // inclusive of return address
3396   };
3397 
3398   _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3399   assert(is_even(_framesize/2), "sp not 16-byte aligned");
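
       // Illustrative example (not normative): with _shadow_space_bytes == 0 and a
       // single integer output register, spill_size_in_bytes() == 8 (2 jint slots),
       // so _framesize = align_up(4 + 2, 4) = 8 slots = 32 bytes, keeping rsp
       // 16-byte aligned after the enter()/subptr() below.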
3400 
3401   _oop_maps  = new OopMapSet();
3402   MacroAssembler* masm = _masm;
3403 
3404   address start = __ pc();
3405 
3406   __ enter();
3407 
3408   // return address and rbp are already in place
3409   __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3410 
3411   _frame_complete = __ pc() - start;
3412 
3413   address the_pc = __ pc();
3414 
3415   __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3416   OopMap* map = new OopMap(_framesize, 0);
3417   _oop_maps->add_gc_map(the_pc - start, map);
3418 
3419   // State transition
3420   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3421 
3422   __ call(RuntimeAddress(_call_target));
3423 
3424   __ restore_cpu_control_state_after_jni();
3425 
3426   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3427 
3428   // Force this write out before the read below
3429   __ membar(Assembler::Membar_mask_bits(
3430           Assembler::LoadLoad | Assembler::LoadStore |
3431           Assembler::StoreLoad | Assembler::StoreStore));
3432 
3433   Label L_after_safepoint_poll;
3434   Label L_safepoint_poll_slow_path;
3435 
3436   __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3437   __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3438   __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3439 
3440   __ bind(L_after_safepoint_poll);
3441 
3442   // change thread state
3443   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3444 
3445   __ block_comment("reguard stack check");
3446   Label L_reguard;
3447   Label L_after_reguard;
3448   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3449   __ jcc(Assembler::equal, L_reguard);
3450   __ bind(L_after_reguard);
3451 
3452   __ reset_last_Java_frame(r15_thread, true);
3453 
3454   __ leave(); // required for proper stackwalking of RuntimeStub frame
3455   __ ret(0);
3456 
3457   //////////////////////////////////////////////////////////////////////////////
3458 
3459   __ block_comment("{ L_safepoint_poll_slow_path");
3460   __ bind(L_safepoint_poll_slow_path);
3461   __ vzeroupper();
3462 
3463   spill_out_registers();
3464 
3465   __ mov(c_rarg0, r15_thread);
3466   __ mov(r12, rsp); // remember sp
3467   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3468   __ andptr(rsp, -16); // align stack as required by ABI
3469   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3470   __ mov(rsp, r12); // restore sp
3471   __ reinit_heapbase();
3472 
3473   fill_out_registers();
3474 
3475   __ jmp(L_after_safepoint_poll);
3476   __ block_comment("} L_safepoint_poll_slow_path");
3477 
3478   //////////////////////////////////////////////////////////////////////////////
3479 
3480   __ block_comment("{ L_reguard");
3481   __ bind(L_reguard);
3482   __ vzeroupper();
3483 
3484   spill_out_registers();
3485 
3486   __ mov(r12, rsp); // remember sp
3487   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3488   __ andptr(rsp, -16); // align stack as required by ABI
3489   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3490   __ mov(rsp, r12); // restore sp
3491   __ reinit_heapbase();
3492 
3493   fill_out_registers();
3494 
3495   __ jmp(L_after_reguard);
3496 
3497   __ block_comment("} L_reguard");
3498 
3499   //////////////////////////////////////////////////////////////////////////////
3500 
3501   __ flush();
3502 }
3503 #endif // COMPILER2
3504 
3505 //------------------------------Montgomery multiplication------------------------
3506 //
3507 
3508 #ifndef _WINDOWS
3509 
3510 // Subtract 0:b from carry:a.  Return carry.
3511 static julong
3512 sub(julong a[], julong b[], julong carry, long len) {
3513   long long i = 0, cnt = len;
3514   julong tmp;
3515   asm volatile("clc; "
3516                "0: ; "
3517                "mov (%[b], %[i], 8), %[tmp]; "
3518                "sbb %[tmp], (%[a], %[i], 8); "
3519                "inc %[i]; dec %[cnt]; "
3520                "jne 0b; "
3521                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3522                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3523                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3524                : "memory");
3525   return tmp;
3526 }
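
     // For reference only: a portable (and unused) sketch of what the inline
     // assembly above computes, assuming a GCC-style unsigned __int128:
     //
     //   static julong sub_ref(julong a[], julong b[], julong carry, long len) {
     //     julong borrow = 0;
     //     for (long i = 0; i < len; i++) {
     //       unsigned __int128 d = (unsigned __int128)a[i] - b[i] - borrow;
     //       a[i]   = (julong)d;                 // low 64 bits of the difference
     //       borrow = (julong)(d >> 64) & 1;     // 1 iff the subtraction wrapped
     //     }
     //     return carry - borrow;
     //   }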
3527 
3528 // Multiply (unsigned) Long A by Long B, accumulating the double-
3529 // length result into the accumulator formed of T0, T1, and T2.
3530 #define MACC(A, B, T0, T1, T2)                                  \
3531 do {                                                            \
3532   unsigned long hi, lo;                                         \
3533   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3534            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3535            : "r"(A), "a"(B) : "cc");                            \
3536  } while(0)
3537 
3538 // As above, but add twice the double-length result into the
3539 // accumulator.
3540 #define MACC2(A, B, T0, T1, T2)                                 \
3541 do {                                                            \
3542   unsigned long hi, lo;                                         \
3543   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3544            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3545            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3546            : "r"(A), "a"(B) : "cc");                            \
3547  } while(0)
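
     // In both cases the net effect (a sketch) on the 192-bit accumulator is:
     //   (T2:T1:T0) += A * B       for MACC
     //   (T2:T1:T0) += 2 * A * B   for MACC2
     // Any carry out of T2 is dropped; for the operand lengths accepted by
     // montgomery_multiply/montgomery_square below it should not occur.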
3548 
3549 #else //_WINDOWS
3550 
3551 static julong
3552 sub(julong a[], julong b[], julong carry, long len) {
3553   long i;
3554   julong tmp;
3555   unsigned char c = 1;
3556   for (i = 0; i < len; i++) {
3557     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3558     a[i] = tmp;
3559   }
3560   c = _addcarry_u64(c, carry, ~0, &tmp);
3561   return tmp;
3562 }
3563 
3564 // Multiply (unsigned) Long A by Long B, accumulating the double-
3565 // length result into the accumulator formed of T0, T1, and T2.
3566 #define MACC(A, B, T0, T1, T2)                          \
3567 do {                                                    \
3568   julong hi, lo;                            \
3569   lo = _umul128(A, B, &hi);                             \
3570   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3571   c = _addcarry_u64(c, hi, T1, &T1);                    \
3572   _addcarry_u64(c, T2, 0, &T2);                         \
3573  } while(0)
3574 
3575 // As above, but add twice the double-length result into the
3576 // accumulator.
3577 #define MACC2(A, B, T0, T1, T2)                         \
3578 do {                                                    \
3579   julong hi, lo;                            \
3580   lo = _umul128(A, B, &hi);                             \
3581   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3582   c = _addcarry_u64(c, hi, T1, &T1);                    \
3583   _addcarry_u64(c, T2, 0, &T2);                         \
3584   c = _addcarry_u64(0, lo, T0, &T0);                    \
3585   c = _addcarry_u64(c, hi, T1, &T1);                    \
3586   _addcarry_u64(c, T2, 0, &T2);                         \
3587  } while(0)
3588 
3589 #endif //_WINDOWS
3590 
3591 // Fast Montgomery multiplication.  The derivation of the algorithm is
3592 // in  A Cryptographic Library for the Motorola DSP56000,
3593 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
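     //
     // In outline (a sketch of the invariant, not a spec): with R = 2^(64*len),
     // the routine computes m with m == a * b * R^-1 (mod n). Each outer
     // iteration uses inv = -n^-1 (mod 2^64) to pick the multiple of n that
     // zeroes the low 64-bit word of the running sum, which is what the
     // assert on t0 below checks.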
3594 
3595 static void NOINLINE
3596 montgomery_multiply(julong a[], julong b[], julong n[],
3597                     julong m[], julong inv, int len) {
3598   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3599   int i;
3600 
3601   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3602 
3603   for (i = 0; i < len; i++) {
3604     int j;
3605     for (j = 0; j < i; j++) {
3606       MACC(a[j], b[i-j], t0, t1, t2);
3607       MACC(m[j], n[i-j], t0, t1, t2);
3608     }
3609     MACC(a[i], b[0], t0, t1, t2);
3610     m[i] = t0 * inv;
3611     MACC(m[i], n[0], t0, t1, t2);
3612 
3613     assert(t0 == 0, "broken Montgomery multiply");
3614 
3615     t0 = t1; t1 = t2; t2 = 0;
3616   }
3617 
3618   for (i = len; i < 2*len; i++) {
3619     int j;
3620     for (j = i-len+1; j < len; j++) {
3621       MACC(a[j], b[i-j], t0, t1, t2);
3622       MACC(m[j], n[i-j], t0, t1, t2);
3623     }
3624     m[i-len] = t0;
3625     t0 = t1; t1 = t2; t2 = 0;
3626   }
3627 
3628   while (t0)
3629     t0 = sub(m, n, t0, len);
3630 }
3631 
3632 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3633 // multiplies so it should be up to 25% faster than Montgomery
3634 // multiplication.  However, its loop control is more complex and it
3635 // may actually run slower on some machines.
3636 
3637 static void NOINLINE
3638 montgomery_square(julong a[], julong n[],
3639                   julong m[], julong inv, int len) {
3640   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3641   int i;
3642 
3643   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3644 
3645   for (i = 0; i < len; i++) {
3646     int j;
3647     int end = (i+1)/2;
3648     for (j = 0; j < end; j++) {
3649       MACC2(a[j], a[i-j], t0, t1, t2);
3650       MACC(m[j], n[i-j], t0, t1, t2);
3651     }
3652     if ((i & 1) == 0) {
3653       MACC(a[j], a[j], t0, t1, t2);
3654     }
3655     for (; j < i; j++) {
3656       MACC(m[j], n[i-j], t0, t1, t2);
3657     }
3658     m[i] = t0 * inv;
3659     MACC(m[i], n[0], t0, t1, t2);
3660 
3661     assert(t0 == 0, "broken Montgomery square");
3662 
3663     t0 = t1; t1 = t2; t2 = 0;
3664   }
3665 
3666   for (i = len; i < 2*len; i++) {
3667     int start = i-len+1;
3668     int end = start + (len - start)/2;
3669     int j;
3670     for (j = start; j < end; j++) {
3671       MACC2(a[j], a[i-j], t0, t1, t2);
3672       MACC(m[j], n[i-j], t0, t1, t2);
3673     }
3674     if ((i & 1) == 0) {
3675       MACC(a[j], a[j], t0, t1, t2);
3676     }
3677     for (; j < len; j++) {
3678       MACC(m[j], n[i-j], t0, t1, t2);
3679     }
3680     m[i-len] = t0;
3681     t0 = t1; t1 = t2; t2 = 0;
3682   }
3683 
3684   while (t0)
3685     t0 = sub(m, n, t0, len);
3686 }
3687 
3688 // Swap words in a longword.
3689 static julong swap(julong x) {
3690   return (x << 32) | (x >> 32);
3691 }
3692 
3693 // Copy len longwords from s to d, word-swapping as we go.  The
3694 // destination array is reversed.
3695 static void reverse_words(julong *s, julong *d, int len) {
3696   d += len;
3697   while(len-- > 0) {
3698     d--;
3699     *d = swap(*s);
3700     s++;
3701   }
3702 }
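
     // For example (illustrative only), with len == 2 and
     //   s = { 0x1111111122222222, 0x3333333344444444 }
     // reverse_words produces
     //   d = { 0x4444444433333333, 0x2222222211111111 }
     // i.e. each longword has its 32-bit halves swapped and the longword order
     // is reversed; this is how the jint-based inputs are converted to the
     // julong layout used by the Montgomery routines above (and back).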
3703 
3704 // The threshold at which squaring is advantageous was determined
3705 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3706 #define MONTGOMERY_SQUARING_THRESHOLD 64
3707 
3708 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3709                                         jint len, jlong inv,
3710                                         jint *m_ints) {
3711   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3712   int longwords = len/2;
3713 
3714   // Make very sure we don't use so much space that the stack might
3715   // overflow. 512 jints corresponds to a 16384-bit integer and
3716   // will use here a total of 8k bytes of stack space (4 arrays of 256 julongs).
3717   int total_allocation = longwords * sizeof (julong) * 4;
3718   guarantee(total_allocation <= 8192, "must be");
3719   julong *scratch = (julong *)alloca(total_allocation);
3720 
3721   // Local scratch arrays
3722   julong
3723     *a = scratch + 0 * longwords,
3724     *b = scratch + 1 * longwords,
3725     *n = scratch + 2 * longwords,
3726     *m = scratch + 3 * longwords;
3727 
3728   reverse_words((julong *)a_ints, a, longwords);
3729   reverse_words((julong *)b_ints, b, longwords);
3730   reverse_words((julong *)n_ints, n, longwords);
3731 
3732   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3733 
3734   reverse_words(m, (julong *)m_ints, longwords);
3735 }
3736 
3737 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3738                                       jint len, jlong inv,
3739                                       jint *m_ints) {
3740   assert(len % 2 == 0, "array length in montgomery_square must be even");
3741   int longwords = len/2;
3742 
3743   // Make very sure we don't use so much space that the stack might
3744   // overflow. 512 jints corresponds to a 16384-bit integer and
3745   // will use here a total of 6k bytes of stack space (3 arrays of 256 julongs).
3746   int total_allocation = longwords * sizeof (julong) * 3;
3747   guarantee(total_allocation <= 8192, "must be");
3748   julong *scratch = (julong *)alloca(total_allocation);
3749 
3750   // Local scratch arrays
3751   julong
3752     *a = scratch + 0 * longwords,
3753     *n = scratch + 1 * longwords,
3754     *m = scratch + 2 * longwords;
3755 
3756   reverse_words((julong *)a_ints, a, longwords);
3757   reverse_words((julong *)n_ints, n, longwords);
3758 
3759   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3760     ::montgomery_square(a, n, m, (julong)inv, longwords);
3761   } else {
3762     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3763   }
3764 
3765   reverse_words(m, (julong *)m_ints, longwords);
3766 }
3767 
3768 #ifdef COMPILER2
3769 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3770 //
3771 //------------------------------generate_exception_blob---------------------------
3772 // Creates the exception blob at the end.
3773 // Compiled code jumps into this blob when an exception is thrown
3774 // (see emit_exception_handler in the x86_64.ad file).
3775 //
3776 // Given an exception pc at a call we call into the runtime for the
3777 // handler in this method. This handler might merely restore state
3778 // (i.e. callee save registers), unwind the frame, and jump to the
3779 // exception handler of the caller if there is no Java level handler
3780 // for the nmethod.
3781 //
3782 // This code is entered with a jmp.
3783 //
3784 // Arguments:
3785 //   rax: exception oop
3786 //   rdx: exception pc
3787 //
3788 // Results:
3789 //   rax: exception oop
3790 //   rdx: exception pc in caller or ???
3791 //   destination: exception handler of caller
3792 //
3793 // Note: the exception pc MUST be at a call (precise debug information)
3794 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3795 //
3796 
3797 void OptoRuntime::generate_exception_blob() {
3798   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3799   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3800   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3801 
3802   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3803 
3804   // Allocate space for the code
3805   ResourceMark rm;
3806   // Setup code generation tools
3807   CodeBuffer buffer("exception_blob", 2048, 1024);
3808   MacroAssembler* masm = new MacroAssembler(&buffer);
3809 
3810 
3811   address start = __ pc();
3812 
3813   // Exception pc is 'return address' for stack walker
3814   __ push(rdx);
3815   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3816 
3817   // Save callee-saved registers.  See x86_64.ad.
3818 
3819   // rbp is an implicitly saved callee saved register (i.e., the calling
3820   // convention will save/restore it in the prolog/epilog). Other than that
3821   // there are no callee save registers now that adapter frames are gone.
3822 
3823   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3824 
3825   // Store exception in Thread object. We cannot pass any arguments to the
3826   // handle_exception call, since we do not want to make any assumption
3827   // about the size of the frame where the exception happened in.
3828   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3829   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3830   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3831 
3832   // This call does all the hard work.  It checks if an exception handler
3833   // exists in the method.
3834   // If so, it returns the handler address.
3835   // If not, it prepares for stack-unwinding, restoring the callee-save
3836   // registers of the frame being removed.
3837   //
3838   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3839 
3840   // At a method handle call, the stack may not be properly aligned
3841   // when returning with an exception.
3842   address the_pc = __ pc();
3843   __ set_last_Java_frame(noreg, noreg, the_pc);
3844   __ mov(c_rarg0, r15_thread);
3845   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3846   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3847 
3848   // Set an oopmap for the call site.  This oopmap will only be used if we
3849   // are unwinding the stack.  Hence, all locations will be dead.
3850   // Callee-saved registers will be the same as the frame above (i.e.,
3851   // handle_exception_stub), since they were restored when we got the
3852   // exception.
3853 
3854   OopMapSet* oop_maps = new OopMapSet();
3855 
3856   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3857 
3858   __ reset_last_Java_frame(false);
3859 
3860   // Restore callee-saved registers
3861 
3862   // rbp is an implicitly saved callee-saved register (i.e., the calling
3863   // convention will save/restore it in the prolog/epilog). Other than that
3864   // there are no callee save registers now that adapter frames are gone.
3865 
3866   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3867 
3868   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3869   __ pop(rdx);                  // No need for exception pc anymore
3870 
3871   // rax: exception handler
3872 
3873   // We have a handler in rax (could be deopt blob).
3874   __ mov(r8, rax);
3875 
3876   // Get the exception oop
3877   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3878   // Get the exception pc in case we are deoptimized
3879   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3880 #ifdef ASSERT
3881   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3882   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3883 #endif
3884   // Clear the exception oop so GC no longer processes it as a root.
3885   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3886 
3887   // rax: exception oop
3888   // r8:  exception handler
3889   // rdx: exception pc
3890   // Jump to handler
3891 
3892   __ jmp(r8);
3893 
3894   // Make sure all code is generated
3895   masm->flush();
3896 
3897   // Set exception blob
3898   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3899 }
3900 #endif // COMPILER2
3901 
3902 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
3903                                        int total_in_args, const VMRegPair* in_regs,
3904                                        int total_out_args, VMRegPair* out_regs,
3905                                        GrowableArray<int>& arg_order,
3906                                        VMRegPair tmp_vmreg) {
3907   ComputeMoveOrder order(total_in_args, in_regs,
3908                          total_out_args, out_regs,
3909                          in_sig_bt, arg_order, tmp_vmreg);
3910 }