1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/debugInfoRec.hpp"
  32 #include "code/icBuffer.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/compiledICHolder.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/jniHandles.hpp"
  48 #include "runtime/safepointMechanism.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/signature.hpp"
  51 #include "runtime/stubRoutines.hpp"
  52 #include "runtime/vframeArray.hpp"
  53 #include "runtime/vm_version.hpp"
  54 #include "utilities/align.hpp"
  55 #include "utilities/formatBuffer.hpp"
  56 #include "vmreg_x86.inline.hpp"
  57 #ifdef COMPILER1
  58 #include "c1/c1_Runtime1.hpp"
  59 #endif
  60 #ifdef COMPILER2
  61 #include "opto/runtime.hpp"
  62 #endif
  63 #if INCLUDE_JVMCI
  64 #include "jvmci/jvmciJavaClasses.hpp"
  65 #endif
  66 
  67 #define __ masm->
  68 
  69 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  70 
  71 class SimpleRuntimeFrame {
  72 
  73   public:
  74 
  75   // Most of the runtime stubs have this simple frame layout.
  76   // This class exists to make the layout shared in one place.
  77   // Offsets are for compiler stack slots, which are jints.
  78   enum layout {
  79     // The frame sender code expects that rbp will be in the "natural" place and
  80     // will override any oopMap setting for it. We must therefore force the layout
  81     // so that it agrees with the frame sender code.
  82     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  83     rbp_off2,
  84     return_off, return_off2,
  85     framesize
  86   };
  87 };
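// For example, on non-Windows targets frame::arg_reg_save_area_bytes is 0, so the layout
// above yields rbp_off == 0, return_off == 2 and framesize == 4 slots (16 bytes).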
  88 
  89 class RegisterSaver {
  90   // Capture info about frame layout.  Layout offsets are in jint
  91   // units because compiler frame slots are jints.
  92 #define XSAVE_AREA_BEGIN 160
  93 #define XSAVE_AREA_YMM_BEGIN 576
  94 #define XSAVE_AREA_OPMASK_BEGIN 1088
  95 #define XSAVE_AREA_ZMM_BEGIN 1152
  96 #define XSAVE_AREA_UPPERBANK 1664
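// Note: these XSAVE_AREA_* offsets assume the standard XSAVE memory layout reported by
// CPUID on current x86 processors: the XMM save area at byte 160 of the legacy FXSAVE
// image, AVX (YMM_Hi128) state at 576, AVX-512 opmask state at 1088, ZMM_Hi256 at 1152
// and Hi16_ZMM at 1664.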
  97 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  98 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  99 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 100 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 101 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 102   enum layout {
 103     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 104     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 105     DEF_XMM_OFFS(0),
 106     DEF_XMM_OFFS(1),
 107     // 2..15 are implied in range usage
 108     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 109     DEF_YMM_OFFS(0),
 110     DEF_YMM_OFFS(1),
 111     // 2..15 are implied in range usage
 112     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 113     DEF_OPMASK_OFFS(0),
 114     DEF_OPMASK_OFFS(1),
 115     // 2..7 are implied in range usage
 116     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 117     DEF_ZMM_OFFS(0),
 118     DEF_ZMM_OFFS(1),
 119     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 120     DEF_ZMM_UPPER_OFFS(16),
 121     DEF_ZMM_UPPER_OFFS(17),
 122     // 18..31 are implied in range usage
 123     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 124     fpu_stateH_end,
 125     r15_off, r15H_off,
 126     r14_off, r14H_off,
 127     r13_off, r13H_off,
 128     r12_off, r12H_off,
 129     r11_off, r11H_off,
 130     r10_off, r10H_off,
 131     r9_off,  r9H_off,
 132     r8_off,  r8H_off,
 133     rdi_off, rdiH_off,
 134     rsi_off, rsiH_off,
 135     ignore_off, ignoreH_off,  // extra copy of rbp
 136     rsp_off, rspH_off,
 137     rbx_off, rbxH_off,
 138     rdx_off, rdxH_off,
 139     rcx_off, rcxH_off,
 140     rax_off, raxH_off,
 141     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 142     align_off, alignH_off,
 143     flags_off, flagsH_off,
 144     // The frame sender code expects that rbp will be in the "natural" place and
 145     // will override any oopMap setting for it. We must therefore force the layout
 146     // so that it agrees with the frame sender code.
 147     rbp_off, rbpH_off,        // copy of rbp we will restore
 148     return_off, returnH_off,  // slot for return address
 149     reg_save_size             // size in compiler stack slots
 150   };
 151 
 152  public:
 153   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 154   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 155 
 156   // Offsets into the register save area
 157   // Used by deoptimization when it is managing result register
 158   // values on its own
 159 
 160   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 161   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 162   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 163   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 164   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 165 
 166   // During deoptimization only the result registers need to be restored,
 167   // all the other values have already been extracted.
 168   static void restore_result_registers(MacroAssembler* masm);
 169 };
 170 
 171 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 172   int off = 0;
 173   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 174   if (UseAVX < 3) {
 175     num_xmm_regs = num_xmm_regs/2;
 176   }
 177 #if COMPILER2_OR_JVMCI
 178   if (save_wide_vectors && UseAVX == 0) {
 179     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 180   }
 181   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 182 #else
 183   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 184 #endif
 185 
 186   // Always make the frame size 16-byte aligned; both vector and non-vector stacks are allocated this way
 187   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 188   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 189   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 190   // CodeBlob frame size is in words.
 191   int frame_size_in_words = frame_size_in_bytes / wordSize;
 192   *total_frame_words = frame_size_in_words;
 193 
 194   // Save registers, fpu state, and flags.
 195   // We assume caller has already pushed the return address onto the
 196   // stack, so rsp is 8-byte aligned here.
 197   // We push rbp twice in this sequence because we want the real rbp
 198   // to be under the return address like a normal enter.
 199 
 200   __ enter();          // rsp becomes 16-byte aligned here
 201   __ push_CPU_state(); // Push a multiple of 16 bytes
 202 
 203   // push_CPU_state handles this on EVEX enabled targets
 204   if (save_wide_vectors) {
 205     // Save upper half of YMM registers(0..15)
 206     int base_addr = XSAVE_AREA_YMM_BEGIN;
 207     for (int n = 0; n < 16; n++) {
 208       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 209     }
 210     if (VM_Version::supports_evex()) {
 211       // Save upper half of ZMM registers(0..15)
 212       base_addr = XSAVE_AREA_ZMM_BEGIN;
 213       for (int n = 0; n < 16; n++) {
 214         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 215       }
 216       // Save full ZMM registers(16..num_xmm_regs)
 217       base_addr = XSAVE_AREA_UPPERBANK;
 218       off = 0;
 219       int vector_len = Assembler::AVX_512bit;
 220       for (int n = 16; n < num_xmm_regs; n++) {
 221         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 222       }
 223 #if COMPILER2_OR_JVMCI
 224       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 225       off = 0;
 226       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 227         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 228       }
 229 #endif
 230     }
 231   } else {
 232     if (VM_Version::supports_evex()) {
 233       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 234       int base_addr = XSAVE_AREA_UPPERBANK;
 235       off = 0;
 236       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 237       for (int n = 16; n < num_xmm_regs; n++) {
 238         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 239       }
 240 #if COMPILER2_OR_JVMCI
 241       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 242       off = 0;
 243       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 244         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 245       }
 246 #endif
 247     }
 248   }
 249   __ vzeroupper();
 250   if (frame::arg_reg_save_area_bytes != 0) {
 251     // Allocate argument register save area
 252     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 253   }
 254 
 255   // Set an oopmap for the call site.  This oopmap will map all
 256   // oop-registers and debug-info registers as callee-saved.  This
 257   // will allow deoptimization at this safepoint to find all possible
 258   // debug-info recordings, as well as let GC find all oops.
 259 
 260   OopMapSet *oop_maps = new OopMapSet();
 261   OopMap* map = new OopMap(frame_size_in_slots, 0);
 262 
 263 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 264 
 265   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 266   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 267   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 269   // rbp location is known implicitly by the frame sender code, needs no oopmap
 270   // and the location where rbp was saved is ignored
 271   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 272   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 273   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 281   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 282   // on EVEX enabled targets, we get it included in the xsave area
 283   off = xmm0_off;
 284   int delta = xmm1_off - off;
 285   for (int n = 0; n < 16; n++) {
 286     XMMRegister xmm_name = as_XMMRegister(n);
 287     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 288     off += delta;
 289   }
 290   if (UseAVX > 2) {
 291     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 292     off = zmm16_off;
 293     delta = zmm17_off - off;
 294     for (int n = 16; n < num_xmm_regs; n++) {
 295       XMMRegister zmm_name = as_XMMRegister(n);
 296       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 297       off += delta;
 298     }
 299   }
 300 
 301 #if COMPILER2_OR_JVMCI
 302   if (save_wide_vectors) {
 303     // Save upper half of YMM registers(0..15)
 304     off = ymm0_off;
 305     delta = ymm1_off - ymm0_off;
 306     for (int n = 0; n < 16; n++) {
 307       XMMRegister ymm_name = as_XMMRegister(n);
 308       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 309       off += delta;
 310     }
 311     if (VM_Version::supports_evex()) {
 312       // Save upper half of ZMM registers(0..15)
 313       off = zmm0_off;
 314       delta = zmm1_off - zmm0_off;
 315       for (int n = 0; n < 16; n++) {
 316         XMMRegister zmm_name = as_XMMRegister(n);
 317         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 318         off += delta;
 319       }
 320     }
 321   }
 322 #endif // COMPILER2_OR_JVMCI
 323 
 324   // %%% These should all be a waste but we'll keep things as they were for now
 325   if (true) {
 326     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 327     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 328     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 330     // rbp location is known implicitly by the frame sender code, needs no oopmap
 331     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 333     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 341     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 342     // on EVEX enabled targets, we get it included in the xsave area
 343     off = xmm0H_off;
 344     delta = xmm1H_off - off;
 345     for (int n = 0; n < 16; n++) {
 346       XMMRegister xmm_name = as_XMMRegister(n);
 347       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 348       off += delta;
 349     }
 350     if (UseAVX > 2) {
 351       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 352       off = zmm16H_off;
 353       delta = zmm17H_off - off;
 354       for (int n = 16; n < num_xmm_regs; n++) {
 355         XMMRegister zmm_name = as_XMMRegister(n);
 356         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 357         off += delta;
 358       }
 359     }
 360   }
 361 
 362   return map;
 363 }
 364 
 365 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 366   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 367   if (UseAVX < 3) {
 368     num_xmm_regs = num_xmm_regs/2;
 369   }
 370   if (frame::arg_reg_save_area_bytes != 0) {
 371     // Pop arg register save area
 372     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 373   }
 374 
 375 #if COMPILER2_OR_JVMCI
 376   if (restore_wide_vectors) {
 377     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 378     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 379   }
 380 #else
 381   assert(!restore_wide_vectors, "vectors are generated only by C2");
 382 #endif
 383 
 384   __ vzeroupper();
 385 
 386   // On EVEX enabled targets everything is handled in pop fpu state
 387   if (restore_wide_vectors) {
 388     // Restore upper half of YMM registers (0..15)
 389     int base_addr = XSAVE_AREA_YMM_BEGIN;
 390     for (int n = 0; n < 16; n++) {
 391       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 392     }
 393     if (VM_Version::supports_evex()) {
 394       // Restore upper half of ZMM registers (0..15)
 395       base_addr = XSAVE_AREA_ZMM_BEGIN;
 396       for (int n = 0; n < 16; n++) {
 397         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 398       }
 399       // Restore full ZMM registers(16..num_xmm_regs)
 400       base_addr = XSAVE_AREA_UPPERBANK;
 401       int vector_len = Assembler::AVX_512bit;
 402       int off = 0;
 403       for (int n = 16; n < num_xmm_regs; n++) {
 404         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 405       }
 406 #if COMPILER2_OR_JVMCI
 407       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 408       off = 0;
 409       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 410         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 411       }
 412 #endif
 413     }
 414   } else {
 415     if (VM_Version::supports_evex()) {
 416       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 417       int base_addr = XSAVE_AREA_UPPERBANK;
 418       int off = 0;
 419       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 420       for (int n = 16; n < num_xmm_regs; n++) {
 421         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 422       }
 423 #if COMPILER2_OR_JVMCI
 424       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 425       off = 0;
 426       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 427         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 428       }
 429 #endif
 430     }
 431   }
 432 
 433   // Recover CPU state
 434   __ pop_CPU_state();
 435   // Get the rbp described implicitly by the calling convention (no oopMap)
 436   __ pop(rbp);
 437 }
 438 
 439 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 440 
 441   // Just restore result register. Only used by deoptimization. By
 442   // now any callee save register that needs to be restored to a c2
 443   // caller of the deoptee has been extracted into the vframeArray
 444   // and will be stuffed into the c2i adapter we create for later
 445   // restoration, so only result registers need to be restored here.
 446 
 447   // Restore fp result register
 448   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 449   // Restore integer result register
 450   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 451   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 452 
 453   // Pop all of the register save area off the stack except the return address
 454   __ addptr(rsp, return_offset_in_bytes());
 455 }
 456 
 457 // Is the vector's size (in bytes) bigger than the size saved by default?
 458 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 459 bool SharedRuntime::is_wide_vector(int size) {
 460   return size > 16;
 461 }
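// For example, 32-byte YMM and 64-byte ZMM vectors are "wide" and need the explicit
// save/restore paths in RegisterSaver above, while 8- and 16-byte vectors are already
// covered by the default fxsave image.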
 462 
 463 // ---------------------------------------------------------------------------
 464 // Read the array of BasicTypes from a signature, and compute where the
 465 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 466 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 467 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 468 // as framesizes are fixed.
 469 // VMRegImpl::stack0 refers to the first slot 0(sp),
 470 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Register
 471 // values up to RegisterImpl::number_of_registers are the 64-bit
 472 // integer registers.
 473 
 474 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 475 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 476 // units regardless of build. Of course for i486 there is no 64-bit build.
 477 
 478 // The Java calling convention is a "shifted" version of the C ABI.
 479 // By skipping the first C ABI register we can call non-static jni methods
 480 // with small numbers of arguments without having to shuffle the arguments
 481 // at all. Since we control the java ABI we ought to at least get some
 482 // advantage out of it.
 483 
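// Illustrative example: for the argument list (int, long, double) the loop below assigns
// j_rarg0 to the int, j_rarg1 to the long (the trailing T_VOID half gets no register) and
// j_farg0 to the double, and the function returns 0 because no stack slots are needed.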
 484 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 485                                            VMRegPair *regs,
 486                                            int total_args_passed) {
 487 
 488   // Create the mapping between argument positions and
 489   // registers.
 490   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 491     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 492   };
 493   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 494     j_farg0, j_farg1, j_farg2, j_farg3,
 495     j_farg4, j_farg5, j_farg6, j_farg7
 496   };
 497 
 498 
 499   uint int_args = 0;
 500   uint fp_args = 0;
 501   uint stk_args = 0; // inc by 2 each time
 502 
 503   for (int i = 0; i < total_args_passed; i++) {
 504     switch (sig_bt[i]) {
 505     case T_BOOLEAN:
 506     case T_CHAR:
 507     case T_BYTE:
 508     case T_SHORT:
 509     case T_INT:
 510       if (int_args < Argument::n_int_register_parameters_j) {
 511         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 512       } else {
 513         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 514         stk_args += 2;
 515       }
 516       break;
 517     case T_VOID:
 518       // halves of T_LONG or T_DOUBLE
 519       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 520       regs[i].set_bad();
 521       break;
 522     case T_LONG:
 523       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 524       // fall through
 525     case T_OBJECT:
 526     case T_ARRAY:
 527     case T_ADDRESS:
 528       if (int_args < Argument::n_int_register_parameters_j) {
 529         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 530       } else {
 531         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 532         stk_args += 2;
 533       }
 534       break;
 535     case T_FLOAT:
 536       if (fp_args < Argument::n_float_register_parameters_j) {
 537         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 538       } else {
 539         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 540         stk_args += 2;
 541       }
 542       break;
 543     case T_DOUBLE:
 544       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 545       if (fp_args < Argument::n_float_register_parameters_j) {
 546         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 547       } else {
 548         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 549         stk_args += 2;
 550       }
 551       break;
 552     default:
 553       ShouldNotReachHere();
 554       break;
 555     }
 556   }
 557 
 558   return align_up(stk_args, 2);
 559 }
 560 
 561 // Patch the caller's callsite with entry to compiled code if it exists.
 562 static void patch_callers_callsite(MacroAssembler *masm) {
 563   Label L;
 564   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
 565   __ jcc(Assembler::equal, L);
 566 
 567   // Save the current stack pointer
 568   __ mov(r13, rsp);
 569   // Schedule the branch target address early.
 570   // Call into the VM to patch the caller, then jump to compiled callee
 571   // rax isn't live so capture return address while we easily can
 572   __ movptr(rax, Address(rsp, 0));
 573 
 574   // align stack so push_CPU_state doesn't fault
 575   __ andptr(rsp, -(StackAlignmentInBytes));
 576   __ push_CPU_state();
 577   __ vzeroupper();
 578   // VM needs caller's callsite
 579   // VM needs target method
 580   // This needs to be a long call since we will relocate this adapter to
 581   // the codeBuffer and it may not reach
 582 
 583   // Allocate argument register save area
 584   if (frame::arg_reg_save_area_bytes != 0) {
 585     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 586   }
 587   __ mov(c_rarg0, rbx);
 588   __ mov(c_rarg1, rax);
 589   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 590 
 591   // De-allocate argument register save area
 592   if (frame::arg_reg_save_area_bytes != 0) {
 593     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 594   }
 595 
 596   __ vzeroupper();
 597   __ pop_CPU_state();
 598   // restore sp
 599   __ mov(rsp, r13);
 600   __ bind(L);
 601 }
 602 
 603 
 604 static void gen_c2i_adapter(MacroAssembler *masm,
 605                             int total_args_passed,
 606                             int comp_args_on_stack,
 607                             const BasicType *sig_bt,
 608                             const VMRegPair *regs,
 609                             Label& skip_fixup) {
 610   // Before we get into the guts of the C2I adapter, see if we should be here
 611   // at all.  We've come from compiled code and are attempting to jump to the
 612   // interpreter, which means the caller made a static call to get here
 613   // (vcalls always get a compiled target if there is one).  Check for a
 614   // compiled target.  If there is one, we need to patch the caller's call.
 615   patch_callers_callsite(masm);
 616 
 617   __ bind(skip_fixup);
 618 
 619   // Since all args are passed on the stack, total_args_passed *
 620   // Interpreter::stackElementSize is the space we need, plus one word for
 621   // the return address location, since we store it first rather than
 622   // holding it in rax across all the shuffling.
 623 
 624   int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
 625 
 626   // stack is aligned, keep it that way
 627   extraspace = align_up(extraspace, 2*wordSize);
 628 
 629   // Get return address
 630   __ pop(rax);
 631 
 632   // set senderSP value
 633   __ mov(r13, rsp);
 634 
 635   __ subptr(rsp, extraspace);
 636 
 637   // Store the return address in the expected location
 638   __ movptr(Address(rsp, 0), rax);
 639 
 640   // Now write the args into the outgoing interpreter space
 641   for (int i = 0; i < total_args_passed; i++) {
 642     if (sig_bt[i] == T_VOID) {
 643       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 644       continue;
 645     }
 646 
 647     // offset to start parameters
 648     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 649     int next_off = st_off - Interpreter::stackElementSize;
 650 
 651     // Say 4 args:
 652     // i   st_off
 653     // 0   32 T_LONG
 654     // 1   24 T_VOID
 655     // 2   16 T_OBJECT
 656     // 3    8 T_BOOL
 657     // -    0 return address
 658     //
 659     // However, to make things extra confusing: because we can fit a long/double in
 660     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 661     // leaves one slot empty and only stores to a single slot. In this case the
 662     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 663 
 664     VMReg r_1 = regs[i].first();
 665     VMReg r_2 = regs[i].second();
 666     if (!r_1->is_valid()) {
 667       assert(!r_2->is_valid(), "");
 668       continue;
 669     }
 670     if (r_1->is_stack()) {
 671       // memory to memory use rax
 672       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 673       if (!r_2->is_valid()) {
 674         // sign extend??
 675         __ movl(rax, Address(rsp, ld_off));
 676         __ movptr(Address(rsp, st_off), rax);
 677 
 678       } else {
 679 
 680         __ movq(rax, Address(rsp, ld_off));
 681 
 682         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 683         // T_DOUBLE and T_LONG use two slots in the interpreter
 684         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 685           // ld_off == LSW, ld_off+wordSize == MSW
 686           // st_off == MSW, next_off == LSW
 687           __ movq(Address(rsp, next_off), rax);
 688 #ifdef ASSERT
 689           // Overwrite the unused slot with known junk
 690           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 691           __ movptr(Address(rsp, st_off), rax);
 692 #endif /* ASSERT */
 693         } else {
 694           __ movq(Address(rsp, st_off), rax);
 695         }
 696       }
 697     } else if (r_1->is_Register()) {
 698       Register r = r_1->as_Register();
 699       if (!r_2->is_valid()) {
 700         // must be only an int (or smaller), so move only 32 bits to the slot
 701         // why not sign extend??
 702         __ movl(Address(rsp, st_off), r);
 703       } else {
 704         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 705         // T_DOUBLE and T_LONG use two slots in the interpreter
 706         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 707           // long/double in gpr
 708 #ifdef ASSERT
 709           // Overwrite the unused slot with known junk
 710           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 711           __ movptr(Address(rsp, st_off), rax);
 712 #endif /* ASSERT */
 713           __ movq(Address(rsp, next_off), r);
 714         } else {
 715           __ movptr(Address(rsp, st_off), r);
 716         }
 717       }
 718     } else {
 719       assert(r_1->is_XMMRegister(), "");
 720       if (!r_2->is_valid()) {
 721         // only a float, use just part of the slot
 722         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 723       } else {
 724 #ifdef ASSERT
 725         // Overwrite the unused slot with known junk
 726         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 727         __ movptr(Address(rsp, st_off), rax);
 728 #endif /* ASSERT */
 729         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 730       }
 731     }
 732   }
 733 
 734   // Schedule the branch target address early.
 735   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 736   __ jmp(rcx);
 737 }
 738 
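// Emits an unsigned range check of pc_reg against (code_start, code_end): control jumps
// to L_ok when code_start < pc_reg < code_end and otherwise falls through at L_fail so
// the caller can emit its own failure handling.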
 739 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 740                         address code_start, address code_end,
 741                         Label& L_ok) {
 742   Label L_fail;
 743   __ lea(temp_reg, ExternalAddress(code_start));
 744   __ cmpptr(pc_reg, temp_reg);
 745   __ jcc(Assembler::belowEqual, L_fail);
 746   __ lea(temp_reg, ExternalAddress(code_end));
 747   __ cmpptr(pc_reg, temp_reg);
 748   __ jcc(Assembler::below, L_ok);
 749   __ bind(L_fail);
 750 }
 751 
 752 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 753                                     int total_args_passed,
 754                                     int comp_args_on_stack,
 755                                     const BasicType *sig_bt,
 756                                     const VMRegPair *regs) {
 757 
 758   // Note: r13 contains the senderSP on entry. We must preserve it since
 759   // we may do an i2c -> c2i transition if we lose a race where compiled
 760   // code goes non-entrant while we get args ready.
 761   // In addition we use r13 to locate all the interpreter args as
 762   // we must align the stack to 16 bytes on an i2c entry else we
 763   // lose the alignment we expect in all compiled code and the register
 764   // save code can segv when fxsave instructions find an improperly
 765   // aligned stack pointer.
 766 
 767   // Adapters can be frameless because they do not require the caller
 768   // to perform additional cleanup work, such as correcting the stack pointer.
 769   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 770   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 771   // even if a callee has modified the stack pointer.
 772   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 773   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 774   // up via the senderSP register).
 775   // In other words, if *either* the caller or callee is interpreted, we can
 776   // get the stack pointer repaired after a call.
 777   // This is why c2i and i2c adapters cannot be indefinitely composed.
 778   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 779   // both caller and callee would be compiled methods, and neither would
 780   // clean up the stack pointer changes performed by the two adapters.
 781   // If this happens, control eventually transfers back to the compiled
 782   // caller, but with an uncorrected stack, causing delayed havoc.
 783 
 784   // Pick up the return address
 785   __ movptr(rax, Address(rsp, 0));
 786 
 787   if (VerifyAdapterCalls &&
 788       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
 789     // So, let's test for cascading c2i/i2c adapters right now.
 790     //  assert(Interpreter::contains($return_addr) ||
 791     //         StubRoutines::contains($return_addr),
 792     //         "i2c adapter must return to an interpreter frame");
 793     __ block_comment("verify_i2c { ");
 794     Label L_ok;
 795     if (Interpreter::code() != NULL)
 796       range_check(masm, rax, r11,
 797                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
 798                   L_ok);
 799     if (StubRoutines::code1() != NULL)
 800       range_check(masm, rax, r11,
 801                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
 802                   L_ok);
 803     if (StubRoutines::code2() != NULL)
 804       range_check(masm, rax, r11,
 805                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
 806                   L_ok);
 807     const char* msg = "i2c adapter must return to an interpreter frame";
 808     __ block_comment(msg);
 809     __ stop(msg);
 810     __ bind(L_ok);
 811     __ block_comment("} verify_i2ce ");
 812   }
 813 
 814   // Must preserve original SP for loading incoming arguments because
 815   // we need to align the outgoing SP for compiled code.
 816   __ movptr(r11, rsp);
 817 
 818   // Cut-out for having no stack args.  Since up to 6 int/oop args are passed
 819   // in registers, we will occasionally have no stack args.
 820   int comp_words_on_stack = 0;
 821   if (comp_args_on_stack) {
 822     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
 823     // registers are below.  By subtracting stack0, we either get a negative
 824     // number (all values in registers) or the maximum stack slot accessed.
 825 
 826     // Convert 4-byte c2 stack slots to words.
 827     comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 828     // Round up to minimum stack alignment, in wordSize
 829     comp_words_on_stack = align_up(comp_words_on_stack, 2);
 830     __ subptr(rsp, comp_words_on_stack * wordSize);
 831   }
 832 
 833 
 834   // Ensure compiled code always sees stack at proper alignment
 835   __ andptr(rsp, -16);
 836 
 837   // push the return address and misalign the stack so that the youngest frame always sees
 838   // the stack just as it would after the placement of a call instruction
 839   __ push(rax);
 840 
 841   // Put saved SP in another register
 842   const Register saved_sp = rax;
 843   __ movptr(saved_sp, r11);
 844 
 845   // Will jump to the compiled code just as if compiled code was doing it.
 846   // Pre-load the register-jump target early, to schedule it better.
 847   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 848 
 849 #if INCLUDE_JVMCI
 850   if (EnableJVMCI) {
 851     // check if this call should be routed towards a specific entry point
 852     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 853     Label no_alternative_target;
 854     __ jcc(Assembler::equal, no_alternative_target);
 855     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 856     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 857     __ bind(no_alternative_target);
 858   }
 859 #endif // INCLUDE_JVMCI
 860 
 861   // Now generate the shuffle code.  Pick up all register args and move the
 862   // rest through the floating point stack top.
 863   for (int i = 0; i < total_args_passed; i++) {
 864     if (sig_bt[i] == T_VOID) {
 865       // Longs and doubles are passed in native word order, but misaligned
 866       // in the 32-bit build.
 867       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 868       continue;
 869     }
 870 
 871     // Pick up 0, 1 or 2 words from SP+offset.
 872 
 873     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 874             "scrambled load targets?");
 875     // Load in argument order going down.
 876     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 877     // Point to interpreter value (vs. tag)
 878     int next_off = ld_off - Interpreter::stackElementSize;
 879     //
 880     //
 881     //
 882     VMReg r_1 = regs[i].first();
 883     VMReg r_2 = regs[i].second();
 884     if (!r_1->is_valid()) {
 885       assert(!r_2->is_valid(), "");
 886       continue;
 887     }
 888     if (r_1->is_stack()) {
 889       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 890       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 891 
 892       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 893       // and if we end up going through a c2i because of a miss, a reasonable value of r13
 894       // will be generated.
 895       if (!r_2->is_valid()) {
 896         // sign extend???
 897         __ movl(r13, Address(saved_sp, ld_off));
 898         __ movptr(Address(rsp, st_off), r13);
 899       } else {
 900         //
 901         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
 902         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
 903         // so we must adjust where to pick up the data to match the interpreter.
 904         //
 905         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
 906         // are accessed at negative offsets, so the LSW is at the lower address
 907 
 908         // ld_off is MSW so get LSW
 909         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 910                            next_off : ld_off;
 911         __ movq(r13, Address(saved_sp, offset));
 912         // st_off is LSW (i.e. reg.first())
 913         __ movq(Address(rsp, st_off), r13);
 914       }
 915     } else if (r_1->is_Register()) {  // Register argument
 916       Register r = r_1->as_Register();
 917       assert(r != rax, "must be different");
 918       if (r_2->is_valid()) {
 919         //
 920         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
 921         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
 922         // so we must adjust where to pick up the data to match the interpreter.
 923 
 924         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 925                            next_off : ld_off;
 926 
 927         // this can be a misaligned move
 928         __ movq(r, Address(saved_sp, offset));
 929       } else {
 930         // sign extend and use a full word?
 931         __ movl(r, Address(saved_sp, ld_off));
 932       }
 933     } else {
 934       if (!r_2->is_valid()) {
 935         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 936       } else {
 937         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 938       }
 939     }
 940   }
 941 
 942   // 6243940 We might end up in handle_wrong_method if
 943   // the callee is deoptimized as we race thru here. If that
 944   // happens we don't want to take a safepoint because the
 945   // caller frame will look interpreted and arguments are now
 946   // "compiled" so it is much better to make this transition
 947   // invisible to the stack walking code. Unfortunately if
 948   // we try and find the callee by normal means a safepoint
 949   // is possible. So we stash the desired callee in the thread
 950   // and the VM will find it there should this case occur.
 951 
 952   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 953 
 954   // put Method* where a c2i would expect it should we end up there;
 955   // only needed because c2 resolve stubs return Method* as a result in
 956   // rax
 957   __ mov(rax, rbx);
 958   __ jmp(r11);
 959 }
 960 
 961 // ---------------------------------------------------------------
 962 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 963                                                             int total_args_passed,
 964                                                             int comp_args_on_stack,
 965                                                             const BasicType *sig_bt,
 966                                                             const VMRegPair *regs,
 967                                                             AdapterFingerPrint* fingerprint) {
 968   address i2c_entry = __ pc();
 969 
 970   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 971 
 972   // -------------------------------------------------------------------------
 973   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 974   // to the interpreter.  The args start out packed in the compiled layout.  They
 975   // need to be unpacked into the interpreter layout.  This will almost always
 976   // require some stack space.  We grow the current (compiled) stack, then repack
 977   // the args.  We finally end in a jump to the generic interpreter entry point.
 978   // On exit from the interpreter, the interpreter will restore our SP (lest the
 979   // compiled code, which relies solely on SP and not RBP, get sick).
 980 
 981   address c2i_unverified_entry = __ pc();
 982   Label skip_fixup;
 983   Label ok;
 984 
 985   Register holder = rax;
 986   Register receiver = j_rarg0;
 987   Register temp = rbx;
 988 
 989   {
 990     __ load_klass(temp, receiver, rscratch1);
 991     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
 992     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
 993     __ jcc(Assembler::equal, ok);
 994     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 995 
 996     __ bind(ok);
 997     // Method might have been compiled since the call site was patched to
 998     // interpreted; if that is the case, treat it as a miss so we can get
 999     // the call site corrected.
1000     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
1001     __ jcc(Assembler::equal, skip_fixup);
1002     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1003   }
1004 
1005   address c2i_entry = __ pc();
1006 
1007   // Class initialization barrier for static methods
1008   address c2i_no_clinit_check_entry = NULL;
1009   if (VM_Version::supports_fast_class_init_checks()) {
1010     Label L_skip_barrier;
1011     Register method = rbx;
1012 
1013     { // Bypass the barrier for non-static methods
1014       Register flags  = rscratch1;
1015       __ movl(flags, Address(method, Method::access_flags_offset()));
1016       __ testl(flags, JVM_ACC_STATIC);
1017       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1018     }
1019 
1020     Register klass = rscratch1;
1021     __ load_method_holder(klass, method);
1022     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1023 
1024     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1025 
1026     __ bind(L_skip_barrier);
1027     c2i_no_clinit_check_entry = __ pc();
1028   }
1029 
1030   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1031   bs->c2i_entry_barrier(masm);
1032 
1033   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1034 
1035   __ flush();
1036   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1037 }
1038 
1039 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1040                                          VMRegPair *regs,
1041                                          VMRegPair *regs2,
1042                                          int total_args_passed) {
1043   assert(regs2 == NULL, "not needed on x86");
1044 // We return the number of VMRegImpl stack slots we need to reserve for all
1045 // the arguments NOT counting out_preserve_stack_slots.
1046 
1047 // NOTE: These arrays will have to change when c1 is ported
1048 #ifdef _WIN64
1049     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1050       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1051     };
1052     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1053       c_farg0, c_farg1, c_farg2, c_farg3
1054     };
1055 #else
1056     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1057       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1058     };
1059     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1060       c_farg0, c_farg1, c_farg2, c_farg3,
1061       c_farg4, c_farg5, c_farg6, c_farg7
1062     };
1063 #endif // _WIN64
1064 
1065 
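    // Illustrative example: for a native signature (int, double, long) the loop below
    // assigns rdi/xmm0/rsi on the System V path (int_args and fp_args advance
    // independently), but rcx/xmm1/r8 on the Win64 path, where every argument consumes
    // one positional slot in both counters and in the 32-byte register home area.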
1066     uint int_args = 0;
1067     uint fp_args = 0;
1068     uint stk_args = 0; // inc by 2 each time
1069 
1070     for (int i = 0; i < total_args_passed; i++) {
1071       switch (sig_bt[i]) {
1072       case T_BOOLEAN:
1073       case T_CHAR:
1074       case T_BYTE:
1075       case T_SHORT:
1076       case T_INT:
1077         if (int_args < Argument::n_int_register_parameters_c) {
1078           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1079 #ifdef _WIN64
1080           fp_args++;
1081           // Allocate slots for callee to stuff register args on the stack.
1082           stk_args += 2;
1083 #endif
1084         } else {
1085           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1086           stk_args += 2;
1087         }
1088         break;
1089       case T_LONG:
1090         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1091         // fall through
1092       case T_OBJECT:
1093       case T_ARRAY:
1094       case T_ADDRESS:
1095       case T_METADATA:
1096         if (int_args < Argument::n_int_register_parameters_c) {
1097           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1098 #ifdef _WIN64
1099           fp_args++;
1100           stk_args += 2;
1101 #endif
1102         } else {
1103           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1104           stk_args += 2;
1105         }
1106         break;
1107       case T_FLOAT:
1108         if (fp_args < Argument::n_float_register_parameters_c) {
1109           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1110 #ifdef _WIN64
1111           int_args++;
1112           // Allocate slots for callee to stuff register args on the stack.
1113           stk_args += 2;
1114 #endif
1115         } else {
1116           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1117           stk_args += 2;
1118         }
1119         break;
1120       case T_DOUBLE:
1121         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1122         if (fp_args < Argument::n_float_register_parameters_c) {
1123           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1124 #ifdef _WIN64
1125           int_args++;
1126           // Allocate slots for callee to stuff register args on the stack.
1127           stk_args += 2;
1128 #endif
1129         } else {
1130           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1131           stk_args += 2;
1132         }
1133         break;
1134       case T_VOID: // Halves of longs and doubles
1135         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1136         regs[i].set_bad();
1137         break;
1138       default:
1139         ShouldNotReachHere();
1140         break;
1141       }
1142     }
1143 #ifdef _WIN64
1144   // The Windows ABI requires that we always allocate enough stack space
1145   // for 4 64-bit registers to be stored down.
1146   if (stk_args < 8) {
1147     stk_args = 8;
1148   }
1149 #endif // _WIN64
1150 
1151   return stk_args;
1152 }
1153 
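// All vector arguments are passed in XMM/YMM/ZMM registers below; stk_args stays 0, so
// no stack slots are reserved for them.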
1154 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1155                                              uint num_bits,
1156                                              uint total_args_passed) {
1157   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1158          "only certain vector sizes are supported for now");
1159 
1160   static const XMMRegister VEC_ArgReg[32] = {
1161      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1162      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1163     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1164     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1165   };
1166 
1167   uint stk_args = 0;
1168   uint fp_args = 0;
1169 
1170   for (uint i = 0; i < total_args_passed; i++) {
1171     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1172     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1173     regs[i].set_pair(vmreg->next(next_val), vmreg);
1174   }
1175 
1176   return stk_args;
1177 }
1178 
1179 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1180   // We always ignore the frame_slots arg and just use the space just below the frame
1181   // pointer, which by this time is free to use
1182   switch (ret_type) {
1183   case T_FLOAT:
1184     __ movflt(Address(rbp, -wordSize), xmm0);
1185     break;
1186   case T_DOUBLE:
1187     __ movdbl(Address(rbp, -wordSize), xmm0);
1188     break;
1189   case T_VOID:  break;
1190   default: {
1191     __ movptr(Address(rbp, -wordSize), rax);
1192     }
1193   }
1194 }
1195 
1196 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1197   // We always ignore the frame_slots arg and just use the space just below the frame
1198   // pointer, which by this time is free to use
1199   switch (ret_type) {
1200   case T_FLOAT:
1201     __ movflt(xmm0, Address(rbp, -wordSize));
1202     break;
1203   case T_DOUBLE:
1204     __ movdbl(xmm0, Address(rbp, -wordSize));
1205     break;
1206   case T_VOID:  break;
1207   default: {
1208     __ movptr(rax, Address(rbp, -wordSize));
1209     }
1210   }
1211 }
1212 
1213 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1214     for ( int i = first_arg ; i < arg_count ; i++ ) {
1215       if (args[i].first()->is_Register()) {
1216         __ push(args[i].first()->as_Register());
1217       } else if (args[i].first()->is_XMMRegister()) {
1218         __ subptr(rsp, 2*wordSize);
1219         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1220       }
1221     }
1222 }
1223 
1224 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1225     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1226       if (args[i].first()->is_Register()) {
1227         __ pop(args[i].first()->as_Register());
1228       } else if (args[i].first()->is_XMMRegister()) {
1229         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1230         __ addptr(rsp, 2*wordSize);
1231       }
1232     }
1233 }
1234 
1235 // Unpack an array argument into a pointer to the body and the length
1236 // if the array is non-null, otherwise pass 0 for both.
1237 static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
1238   Register tmp_reg = rax;
1239   assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
1240          "possible collision");
1241   assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
1242          "possible collision");
1243 
1244   __ block_comment("unpack_array_argument {");
1245 
1246   // Pass the length, ptr pair
1247   Label is_null, done;
1248   VMRegPair tmp;
1249   tmp.set_ptr(tmp_reg->as_VMReg());
1250   if (reg.first()->is_stack()) {
1251     // Load the arg up from the stack
1252     __ move_ptr(reg, tmp);
1253     reg = tmp;
1254   }
1255   __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
1256   __ jccb(Assembler::equal, is_null);
1257   __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1258   __ move_ptr(tmp, body_arg);
1259   // load the length relative to the body.
1260   __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
1261                            arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1262   __ move32_64(tmp, length_arg);
1263   __ jmpb(done);
1264   __ bind(is_null);
1265   // Pass zeros
1266   __ xorptr(tmp_reg, tmp_reg);
1267   __ move_ptr(tmp, body_arg);
1268   __ move32_64(tmp, length_arg);
1269   __ bind(done);
1270 
1271   __ block_comment("} unpack_array_argument");
1272 }
1273 
1274 
1275 // Different signatures may require very different orders for the move
1276 // to avoid clobbering other arguments.  There's no simple way to
1277 // order them safely.  Compute a safe order for issuing stores and
1278 // break any cycles in those stores.  This code is fairly general but
1279 // it's not necessary on the other platforms so we keep it in the
1280 // platform dependent code instead of moving it into a shared file.
1281 // (See bugs 7013347 & 7145024.)
1282 // Note that this code is specific to LP64.
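// For example, if one outgoing move needs rdi -> rsi while another needs rsi -> rdi, the
// two stores form a cycle; break_cycle() redirects one of them into the supplied temp
// register and appends a final store from the temp into the real destination.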
1283 class ComputeMoveOrder: public StackObj {
1284   class MoveOperation: public ResourceObj {
1285     friend class ComputeMoveOrder;
1286    private:
1287     VMRegPair        _src;
1288     VMRegPair        _dst;
1289     int              _src_index;
1290     int              _dst_index;
1291     bool             _processed;
1292     MoveOperation*  _next;
1293     MoveOperation*  _prev;
1294 
1295     static int get_id(VMRegPair r) {
1296       return r.first()->value();
1297     }
1298 
1299    public:
1300     MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1301       _src(src)
1302     , _dst(dst)
1303     , _src_index(src_index)
1304     , _dst_index(dst_index)
1305     , _processed(false)
1306     , _next(NULL)
1307     , _prev(NULL) {
1308     }
1309 
1310     VMRegPair src() const              { return _src; }
1311     int src_id() const                 { return get_id(src()); }
1312     int src_index() const              { return _src_index; }
1313     VMRegPair dst() const              { return _dst; }
1314     void set_dst(int i, VMRegPair dst) { _dst_index = i; _dst = dst; }
1315     int dst_index() const              { return _dst_index; }
1316     int dst_id() const                 { return get_id(dst()); }
1317     MoveOperation* next() const       { return _next; }
1318     MoveOperation* prev() const       { return _prev; }
1319     void set_processed()               { _processed = true; }
1320     bool is_processed() const          { return _processed; }
1321 
1322     // Insert a new store through temp_register to break a cycle.
1323     void break_cycle(VMRegPair temp_register) {
1324       // create a new store following the last store
1325       // to move from the temp_register to the original
1326       MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1327 
1328       // break the cycle of links and insert new_store at the end
1329       // break the reverse link.
1330       MoveOperation* p = prev();
1331       assert(p->next() == this, "must be");
1332       _prev = NULL;
1333       p->_next = new_store;
1334       new_store->_prev = p;
1335 
1336       // change the original store to save its value in the temp.
1337       set_dst(-1, temp_register);
1338     }
1339 
1340     void link(GrowableArray<MoveOperation*>& killer) {
1341       // link this store in front of the store that it depends on
1342       MoveOperation* n = killer.at_grow(src_id(), NULL);
1343       if (n != NULL) {
1344         assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1345         _next = n;
1346         n->_prev = this;
1347       }
1348     }
1349   };
1350 
1351  private:
1352   GrowableArray<MoveOperation*> edges;
1353 
1354  public:
1355   ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1356                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1357     // Move operations where the dest is the stack can all be
1358     // scheduled first since they can't interfere with the other moves.
1359     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1360       if (in_sig_bt[i] == T_ARRAY) {
1361         c_arg--;
1362         if (out_regs[c_arg].first()->is_stack() &&
1363             out_regs[c_arg + 1].first()->is_stack()) {
1364           arg_order.push(i);
1365           arg_order.push(c_arg);
1366         } else {
1367           if (out_regs[c_arg].first()->is_stack() ||
1368               in_regs[i].first() == out_regs[c_arg].first()) {
1369             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1370           } else {
1371             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1372           }
1373         }
1374       } else if (in_sig_bt[i] == T_VOID) {
1375         arg_order.push(i);
1376         arg_order.push(c_arg);
1377       } else {
1378         if (out_regs[c_arg].first()->is_stack() ||
1379             in_regs[i].first() == out_regs[c_arg].first()) {
1380           arg_order.push(i);
1381           arg_order.push(c_arg);
1382         } else {
1383           add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1384         }
1385       }
1386     }
1387     // Break any cycles in the register moves and emit them in the
1388     // proper order.
1389     GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1390     for (int i = 0; i < stores->length(); i++) {
1391       arg_order.push(stores->at(i)->src_index());
1392       arg_order.push(stores->at(i)->dst_index());
1393     }
1394  }
1395 
1396   // Collect all the move operations
1397   void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1398     if (src.first() == dst.first()) return;
1399     edges.append(new MoveOperation(src_index, src, dst_index, dst));
1400   }
1401 
1402   // Walk the edges breaking cycles between moves.  The result list
1403   // can be walked in order to produce the proper order of stores.
1404   GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1405     // Record which moves kill which values
1406     GrowableArray<MoveOperation*> killer;
1407     for (int i = 0; i < edges.length(); i++) {
1408       MoveOperation* s = edges.at(i);
1409       assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1410       killer.at_put_grow(s->dst_id(), s, NULL);
1411     }
1412     assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1413            "make sure temp isn't in the registers that are killed");
1414 
1415     // create links between loads and stores
1416     for (int i = 0; i < edges.length(); i++) {
1417       edges.at(i)->link(killer);
1418     }
1419 
1420     // at this point, all the move operations are chained together
1421     // in a doubly linked list.  Processing it backwards finds
1422     // the beginning of the chain, forwards finds the end.  If there's
1423     // a cycle it can be broken at any point,  so pick an edge and walk
1424     // backward until the list ends or we end where we started.
1425     GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1426     for (int e = 0; e < edges.length(); e++) {
1427       MoveOperation* s = edges.at(e);
1428       if (!s->is_processed()) {
1429         MoveOperation* start = s;
1430         // search for the beginning of the chain or cycle
1431         while (start->prev() != NULL && start->prev() != s) {
1432           start = start->prev();
1433         }
1434         if (start->prev() == s) {
1435           start->break_cycle(temp_register);
1436         }
1437         // walk the chain forward inserting to store list
1438         while (start != NULL) {
1439           stores->append(start);
1440           start->set_processed();
1441           start = start->next();
1442         }
1443       }
1444     }
1445     return stores;
1446   }
1447 };
1448 
1449 static void verify_oop_args(MacroAssembler* masm,
1450                             const methodHandle& method,
1451                             const BasicType* sig_bt,
1452                             const VMRegPair* regs) {
1453   Register temp_reg = rbx;  // not part of any compiled calling seq
1454   if (VerifyOops) {
1455     for (int i = 0; i < method->size_of_parameters(); i++) {
1456       if (is_reference_type(sig_bt[i])) {
1457         VMReg r = regs[i].first();
1458         assert(r->is_valid(), "bad oop arg");
1459         if (r->is_stack()) {
1460           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1461           __ verify_oop(temp_reg);
1462         } else {
1463           __ verify_oop(r->as_Register());
1464         }
1465       }
1466     }
1467   }
1468 }
1469 
1470 static void gen_special_dispatch(MacroAssembler* masm,
1471                                  const methodHandle& method,
1472                                  const BasicType* sig_bt,
1473                                  const VMRegPair* regs) {
1474   verify_oop_args(masm, method, sig_bt, regs);
1475   vmIntrinsics::ID iid = method->intrinsic_id();
1476 
1477   // Now write the args into the outgoing interpreter space
1478   bool     has_receiver   = false;
1479   Register receiver_reg   = noreg;
1480   int      member_arg_pos = -1;
1481   Register member_reg     = noreg;
1482   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1483   if (ref_kind != 0) {
1484     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1485     member_reg = rbx;  // known to be free at this point
1486     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1487   } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1488     has_receiver = true;
1489   } else {
1490     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1491   }
1492 
1493   if (member_reg != noreg) {
1494     // Load the member_arg into register, if necessary.
1495     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1496     VMReg r = regs[member_arg_pos].first();
1497     if (r->is_stack()) {
1498       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1499     } else {
1500       // no data motion is needed
1501       member_reg = r->as_Register();
1502     }
1503   }
1504 
1505   if (has_receiver) {
1506     // Make sure the receiver is loaded into a register.
1507     assert(method->size_of_parameters() > 0, "oob");
1508     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1509     VMReg r = regs[0].first();
1510     assert(r->is_valid(), "bad receiver arg");
1511     if (r->is_stack()) {
1512       // Porting note:  This assumes that compiled calling conventions always
1513       // pass the receiver oop in a register.  If this is not true on some
1514       // platform, pick a temp and load the receiver from stack.
1515       fatal("receiver always in a register");
1516       receiver_reg = j_rarg0;  // known to be free at this point
1517       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1518     } else {
1519       // no data motion is needed
1520       receiver_reg = r->as_Register();
1521     }
1522   }
1523 
1524   // Figure out which address we are really jumping to:
1525   MethodHandles::generate_method_handle_dispatch(masm, iid,
1526                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1527 }
1528 
1529 // ---------------------------------------------------------------------------
1530 // Generate a native wrapper for a given method.  The method takes arguments
1531 // in the Java compiled code convention, marshals them to the native
1532 // convention (handlizes oops, etc), transitions to native, makes the call,
1533 // returns to java state (possibly blocking), unhandlizes any result and
1534 // returns.
1535 //
1536 // Critical native functions are a shorthand for the use of
1537 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1538 // functions.  The wrapper is expected to unpack the arguments before
1539 // passing them to the callee. Critical native functions leave the state _in_Java,
1540 // since they cannot stop for GC.
1541 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1542 // block and the check for pending exceptions, since it's impossible for them
1543 // to be thrown.
1544 //
1545 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1546                                                 const methodHandle& method,
1547                                                 int compile_id,
1548                                                 BasicType* in_sig_bt,
1549                                                 VMRegPair* in_regs,
1550                                                 BasicType ret_type,
1551                                                 address critical_entry) {
1552   if (method->is_method_handle_intrinsic()) {
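         // Method handle intrinsics are dispatched entirely by gen_special_dispatch:
         // they build no frame of their own and need no oop maps, which is why the
         // nmethod created below gets -1 offsets and a NULL OopMapSet.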
1553     vmIntrinsics::ID iid = method->intrinsic_id();
1554     intptr_t start = (intptr_t)__ pc();
1555     int vep_offset = ((intptr_t)__ pc()) - start;
1556     gen_special_dispatch(masm,
1557                          method,
1558                          in_sig_bt,
1559                          in_regs);
1560     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1561     __ flush();
1562     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1563     return nmethod::new_native_nmethod(method,
1564                                        compile_id,
1565                                        masm->code(),
1566                                        vep_offset,
1567                                        frame_complete,
1568                                        stack_slots / VMRegImpl::slots_per_word,
1569                                        in_ByteSize(-1),
1570                                        in_ByteSize(-1),
1571                                        (OopMapSet*)NULL);
1572   }
1573   bool is_critical_native = true;
1574   address native_func = critical_entry;
1575   if (native_func == NULL) {
1576     native_func = method->native_function();
1577     is_critical_native = false;
1578   }
1579   assert(native_func != NULL, "must have function");
1580 
1581   // An OopMap for lock (and class if static)
1582   OopMapSet *oop_maps = new OopMapSet();
1583   intptr_t start = (intptr_t)__ pc();
1584 
1585   // We have received a description of where all the java args are located
1586   // on entry to the wrapper. We need to convert these args to where
1587   // the jni function will expect them. To figure out where they go
1588   // we convert the java signature to a C signature by inserting
1589   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1590 
1591   const int total_in_args = method->size_of_parameters();
1592   int total_c_args = total_in_args;
1593   if (!is_critical_native) {
1594     total_c_args += 1;
1595     if (method->is_static()) {
1596       total_c_args++;
1597     }
1598   } else {
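         // For critical natives each array argument expands into a (length, body)
         // pair (see the out_sig_bt setup below), so count one extra C arg per T_ARRAY.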
1599     for (int i = 0; i < total_in_args; i++) {
1600       if (in_sig_bt[i] == T_ARRAY) {
1601         total_c_args++;
1602       }
1603     }
1604   }
1605 
1606   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1607   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1608   BasicType* in_elem_bt = NULL;
1609 
1610   int argc = 0;
1611   if (!is_critical_native) {
1612     out_sig_bt[argc++] = T_ADDRESS;
1613     if (method->is_static()) {
1614       out_sig_bt[argc++] = T_OBJECT;
1615     }
1616 
1617     for (int i = 0; i < total_in_args ; i++ ) {
1618       out_sig_bt[argc++] = in_sig_bt[i];
1619     }
1620   } else {
1621     in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1622     SignatureStream ss(method->signature());
1623     for (int i = 0; i < total_in_args ; i++ ) {
1624       if (in_sig_bt[i] == T_ARRAY) {
1625         // Arrays are passed as int, elem* pair
1626         out_sig_bt[argc++] = T_INT;
1627         out_sig_bt[argc++] = T_ADDRESS;
1628         ss.skip_array_prefix(1);  // skip one '['
1629         assert(ss.is_primitive(), "primitive type expected");
1630         in_elem_bt[i] = ss.type();
1631       } else {
1632         out_sig_bt[argc++] = in_sig_bt[i];
1633         in_elem_bt[i] = T_VOID;
1634       }
1635       if (in_sig_bt[i] != T_VOID) {
1636         assert(in_sig_bt[i] == ss.type() ||
1637                in_sig_bt[i] == T_ARRAY, "must match");
1638         ss.next();
1639       }
1640     }
1641   }
1642 
1643   // Now figure out where the args must be stored and how much stack space
1644   // they require.
1645   int out_arg_slots;
1646   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1647 
1648   // Compute framesize for the wrapper.  We need to handlize all oops in
1649   // incoming registers
1650 
1651   // Calculate the total number of stack slots we will need.
1652 
1653   // First count the abi requirement plus all of the outgoing args
1654   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1655 
1656   // Now the space for the inbound oop handle area
1657   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1658   if (is_critical_native) {
1659     // Critical natives may have to call out so they need a save area
1660     // for register arguments.
1661     int double_slots = 0;
1662     int single_slots = 0;
1663     for ( int i = 0; i < total_in_args; i++) {
1664       if (in_regs[i].first()->is_Register()) {
1665         const Register reg = in_regs[i].first()->as_Register();
1666         switch (in_sig_bt[i]) {
1667           case T_BOOLEAN:
1668           case T_BYTE:
1669           case T_SHORT:
1670           case T_CHAR:
1671           case T_INT:  single_slots++; break;
1672           case T_ARRAY:  // specific to LP64 (7145024)
1673           case T_LONG: double_slots++; break;
1674           default:  ShouldNotReachHere();
1675         }
1676       } else if (in_regs[i].first()->is_XMMRegister()) {
1677         switch (in_sig_bt[i]) {
1678           case T_FLOAT:  single_slots++; break;
1679           case T_DOUBLE: double_slots++; break;
1680           default:  ShouldNotReachHere();
1681         }
1682       } else if (in_regs[i].first()->is_FloatRegister()) {
1683         ShouldNotReachHere();
1684       }
1685     }
1686     total_save_slots = double_slots * 2 + single_slots;
1687     // align the save area
1688     if (double_slots != 0) {
1689       stack_slots = align_up(stack_slots, 2);
1690     }
1691   }
1692 
1693   int oop_handle_offset = stack_slots;
1694   stack_slots += total_save_slots;
1695 
1696   // Now any space we need for handlizing a klass if this is a static method
1697 
1698   int klass_slot_offset = 0;
1699   int klass_offset = -1;
1700   int lock_slot_offset = 0;
1701   bool is_static = false;
1702 
1703   if (method->is_static()) {
1704     klass_slot_offset = stack_slots;
1705     stack_slots += VMRegImpl::slots_per_word;
1706     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1707     is_static = true;
1708   }
1709 
1710   // Plus a lock if needed
1711 
1712   if (method->is_synchronized()) {
1713     lock_slot_offset = stack_slots;
1714     stack_slots += VMRegImpl::slots_per_word;
1715   }
1716 
1717   // Now a place (+2) to save return values or temp during shuffling
1718   // + 4 for return address (which we own) and saved rbp
1719   stack_slots += 6;
1720 
1721   // OK. The space we have allocated will look like:
1722   //
1723   //
1724   // FP-> |                     |
1725   //      |---------------------|
1726   //      | 2 slots for moves   |
1727   //      |---------------------|
1728   //      | lock box (if sync)  |
1729   //      |---------------------| <- lock_slot_offset
1730   //      | klass (if static)   |
1731   //      |---------------------| <- klass_slot_offset
1732   //      | oopHandle area      |
1733   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1734   //      | outbound memory     |
1735   //      | based arguments     |
1736   //      |                     |
1737   //      |---------------------|
1738   //      |                     |
1739   // SP-> | out_preserved_slots |
1740   //
1741   //
1742 
1743 
1744   // Now compute the actual number of stack words we need, rounding to keep the
1745   // stack properly aligned.
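       // (StackAlignmentInBytes is 16 on x86_64, so this rounds stack_slots up to a
       // multiple of 4 jint-sized slots.)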
1746   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1747 
1748   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1749 
1750   // First thing: make an IC check to see if we should even be here
1751 
1752   // We are free to use all registers as temps without saving them and
1753   // restoring them except rbp. rbp is the only callee save register
1754   // as far as the interpreter and the compiler(s) are concerned.
1755 
1756 
1757   const Register ic_reg = rax;
1758   const Register receiver = j_rarg0;
1759 
1760   Label hit;
1761   Label exception_pending;
1762 
1763   assert_different_registers(ic_reg, receiver, rscratch1);
1764   __ verify_oop(receiver);
1765   __ load_klass(rscratch1, receiver, rscratch2);
1766   __ cmpq(ic_reg, rscratch1);
1767   __ jcc(Assembler::equal, hit);
1768 
1769   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1770 
1771   // Verified entry point must be aligned
1772   __ align(8);
1773 
1774   __ bind(hit);
1775 
1776   int vep_offset = ((intptr_t)__ pc()) - start;
1777 
1778   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1779     Label L_skip_barrier;
1780     Register klass = r10;
1781     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1782     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1783 
1784     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1785 
1786     __ bind(L_skip_barrier);
1787   }
1788 
1789 #ifdef COMPILER1
1790   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1791   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1792     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1793   }
1794 #endif // COMPILER1
1795 
1796   // The instruction at the verified entry point must be 5 bytes or longer
1797   // because it can be patched on the fly by make_non_entrant. The stack bang
1798   // instruction fits that requirement.
1799 
1800   // Generate stack overflow check
1801   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1802 
1803   // Generate a new frame for the wrapper.
1804   __ enter();
1805   // -2 because return address is already present and so is saved rbp
1806   __ subptr(rsp, stack_size - 2*wordSize);
1807 
1808   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1809   bs->nmethod_entry_barrier(masm);
1810 
1811   // Frame is now completed as far as size and linkage.
1812   int frame_complete = ((intptr_t)__ pc()) - start;
1813 
1814     if (UseRTMLocking) {
1815       // Abort RTM transaction before calling JNI
1816       // because critical section will be large and will be
1817       // aborted anyway. Also nmethod could be deoptimized.
1818       __ xabort(0);
1819     }
1820 
1821 #ifdef ASSERT
1822     {
1823       Label L;
1824       __ mov(rax, rsp);
1825       __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1826       __ cmpptr(rax, rsp);
1827       __ jcc(Assembler::equal, L);
1828       __ stop("improperly aligned stack");
1829       __ bind(L);
1830     }
1831 #endif /* ASSERT */
1832 
1833 
1834   // We use r14 as the oop handle for the receiver/klass
1835   // It is callee save so it survives the call to native
1836 
1837   const Register oop_handle_reg = r14;
1838 
1839   //
1840   // We immediately shuffle the arguments so that, for any vm call we have to
1841   // make from here on out (sync slow path, jvmti, etc.), we will have
1842   // captured the oops from our caller and have a valid oopMap for
1843   // them.
1844 
1845   // -----------------
1846   // The Grand Shuffle
1847 
1848   // The Java calling convention is either equal (linux) or denser (win64) than the
1849   // c calling convention. However, because of the jni_env argument the c calling
1850   // convention always has at least one more argument (two more for static methods) than Java.
1851   // Therefore if we move the args from java -> c backwards then we will never have
1852   // a register->register conflict and we don't have to build a dependency graph
1853   // and figure out how to break any cycles.
1854   //
1855 
1856   // Record esp-based slot for receiver on stack for non-static methods
1857   int receiver_offset = -1;
1858 
1859   // This is a trick. We double the stack slots so we can claim
1860   // the oops in the caller's frame. Since we are sure to have
1861   // more args than the caller, doubling is enough to make
1862   // sure we can capture all the incoming oop args from the
1863   // caller.
1864   //
1865   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1866 
1867   // Mark location of rbp (someday)
1868   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1869 
1870   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1871   // All inbound args are referenced based on rbp and all outbound args via rsp.
1872 
1873 
1874 #ifdef ASSERT
1875   bool reg_destroyed[RegisterImpl::number_of_registers];
1876   bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1877   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1878     reg_destroyed[r] = false;
1879   }
1880   for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1881     freg_destroyed[f] = false;
1882   }
1883 
1884 #endif /* ASSERT */
1885 
1886   // This may iterate in two different directions depending on the
1887   // kind of native it is.  The reason is that for regular JNI natives
1888   // the incoming and outgoing registers are offset upwards and for
1889   // critical natives they are offset down.
1890   GrowableArray<int> arg_order(2 * total_in_args);
1891 
1892   VMRegPair tmp_vmreg;
1893   tmp_vmreg.set2(rbx->as_VMReg());
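       // rbx serves as the scratch location used to break cycles in the argument
       // shuffle (see the c_arg == -1 / i == -1 cases below and ComputeMoveOrder).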
1894 
1895   if (!is_critical_native) {
1896     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1897       arg_order.push(i);
1898       arg_order.push(c_arg);
1899     }
1900   } else {
1901     // Compute a valid move order, using tmp_vmreg to break any cycles
1902     ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
1903   }
1904 
1905   int temploc = -1;
1906   for (int ai = 0; ai < arg_order.length(); ai += 2) {
1907     int i = arg_order.at(ai);
1908     int c_arg = arg_order.at(ai + 1);
1909     __ block_comment(err_msg("move %d -> %d", i, c_arg));
1910     if (c_arg == -1) {
1911       assert(is_critical_native, "should only be required for critical natives");
1912       // This arg needs to be moved to a temporary
1913       __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
1914       in_regs[i] = tmp_vmreg;
1915       temploc = i;
1916       continue;
1917     } else if (i == -1) {
1918       assert(is_critical_native, "should only be required for critical natives");
1919       // Read from the temporary location
1920       assert(temploc != -1, "must be valid");
1921       i = temploc;
1922       temploc = -1;
1923     }
1924 #ifdef ASSERT
1925     if (in_regs[i].first()->is_Register()) {
1926       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1927     } else if (in_regs[i].first()->is_XMMRegister()) {
1928       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1929     }
1930     if (out_regs[c_arg].first()->is_Register()) {
1931       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1932     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1933       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1934     }
1935 #endif /* ASSERT */
1936     switch (in_sig_bt[i]) {
1937       case T_ARRAY:
1938         if (is_critical_native) {
1939           unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
1940           c_arg++;
1941 #ifdef ASSERT
1942           if (out_regs[c_arg].first()->is_Register()) {
1943             reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1944           } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1945             freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1946           }
1947 #endif
1948           break;
1949         }
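             // For regular JNI natives an array is passed as an ordinary oop handle,
             // so fall through to the T_OBJECT case.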
1950       case T_OBJECT:
1951         assert(!is_critical_native, "no oop arguments");
1952         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1953                     ((i == 0) && (!is_static)),
1954                     &receiver_offset);
1955         break;
1956       case T_VOID:
1957         break;
1958 
1959       case T_FLOAT:
1960         __ float_move(in_regs[i], out_regs[c_arg]);
1961         break;
1962 
1963       case T_DOUBLE:
1964         assert( i + 1 < total_in_args &&
1965                 in_sig_bt[i + 1] == T_VOID &&
1966                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1967         __ double_move(in_regs[i], out_regs[c_arg]);
1968         break;
1969 
1970       case T_LONG :
1971         __ long_move(in_regs[i], out_regs[c_arg]);
1972         break;
1973 
1974       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1975 
1976       default:
1977         __ move32_64(in_regs[i], out_regs[c_arg]);
1978     }
1979   }
1980 
1981   int c_arg;
1982 
1983   // Pre-load a static method's oop into r14.  Used both by locking code and
1984   // the normal JNI call code.
1985   if (!is_critical_native) {
1986     // point c_arg at the first arg that is already loaded in case we
1987     // need to spill before we call out
1988     c_arg = total_c_args - total_in_args;
1989 
1990     if (method->is_static()) {
1991 
1992       //  load oop into a register
1993       __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1994 
1995       // Now handlize the static class mirror; it's known to be non-null.
1996       __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1997       map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1998 
1999       // Now get the handle
2000       __ lea(oop_handle_reg, Address(rsp, klass_offset));
2001       // store the klass handle as second argument
2002       __ movptr(c_rarg1, oop_handle_reg);
2003       // and protect the arg if we must spill
2004       c_arg--;
2005     }
2006   } else {
2007     // For JNI critical methods we need to save all registers in save_args.
2008     c_arg = 0;
2009   }
2010 
2011   // Change state to native (we save the return address in the thread, since it might not
2012   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2013   // points into the right code segment. It does not have to be the correct return pc.
2014   // We use the same pc/oopMap repeatedly when we call out
2015 
2016   intptr_t the_pc = (intptr_t) __ pc();
2017   oop_maps->add_gc_map(the_pc - start, map);
2018 
2019   __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2020 
2021 
2022   // We have all of the arguments set up at this point. We must not touch any of the
2023   // argument registers from here on (if we had to save/restore them, there is no oopMap covering them).
2024 
2025   {
2026     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2027     // protect the args we've loaded
2028     save_args(masm, total_c_args, c_arg, out_regs);
2029     __ mov_metadata(c_rarg1, method());
2030     __ call_VM_leaf(
2031       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2032       r15_thread, c_rarg1);
2033     restore_args(masm, total_c_args, c_arg, out_regs);
2034   }
2035 
2036   // RedefineClasses() tracing support for obsolete method entry
2037   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2038     // protect the args we've loaded
2039     save_args(masm, total_c_args, c_arg, out_regs);
2040     __ mov_metadata(c_rarg1, method());
2041     __ call_VM_leaf(
2042       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2043       r15_thread, c_rarg1);
2044     restore_args(masm, total_c_args, c_arg, out_regs);
2045   }
2046 
2047   // Lock a synchronized method
2048 
2049   // Register definitions used by locking and unlocking
2050 
2051   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2052   const Register obj_reg  = rbx;  // Will contain the oop
2053   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2054   const Register old_hdr  = r13;  // value of old header at unlock time
2055 
2056   Label slow_path_lock;
2057   Label lock_done;
2058 
2059   if (method->is_synchronized()) {
2060     assert(!is_critical_native, "unhandled");
2061 
2062 
2063     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2064 
2065     // Get the handle (the 2nd argument)
2066     __ mov(oop_handle_reg, c_rarg1);
2067 
2068     // Get address of the box
2069 
2070     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2071 
2072     // Load the oop from the handle
2073     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2074 
2075     if (UseFastLocking) {
2076       // Load object header
2077       __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2078       __ fast_lock_impl(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2079     } else {
2080       if (UseBiasedLocking) {
2081         __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, rscratch2, false, lock_done, &slow_path_lock);
2082       }
2083 
2084       // Load immediate 1 into swap_reg %rax
2085       __ movl(swap_reg, 1);
2086 
2087       // Load (object->mark() | 1) into swap_reg %rax
2088       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2089 
2090       // Save (object->mark() | 1) into BasicLock's displaced header
2091       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2092 
2093       // src -> dest iff dest == rax else rax <- dest
2094       __ lock();
2095       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2096       __ jcc(Assembler::equal, lock_done);
2097 
2098       // Hmm should this move to the slow path code area???
2099 
2100       // Test if the oopMark is an obvious stack pointer, i.e.,
2101       //  1) (mark & 3) == 0, and
2102       //  2) rsp <= mark < mark + os::pagesize()
2103       // These 3 tests can be done by evaluating the following
2104       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2105       // assuming both stack pointer and pagesize have their
2106       // least significant 2 bits clear.
2107       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2108 
2109       __ subptr(swap_reg, rsp);
2110       __ andptr(swap_reg, 3 - os::vm_page_size());
2111 
2112       // Save the test result, for recursive case, the result is zero
2113       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2114       __ jcc(Assembler::notEqual, slow_path_lock);
2115     }
2116 
2117     // Slow path will re-enter here
2118 
2119     __ bind(lock_done);
2120   }
2121 
2122   // Finally just about ready to make the JNI call
2123 
2124   // get JNIEnv* which is first argument to native
2125   if (!is_critical_native) {
2126     __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2127 
2128     // Now set thread in native
2129     __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2130   }
2131 
2132   __ call(RuntimeAddress(native_func));
2133 
2134   // Verify or restore cpu control state after JNI call
2135   __ restore_cpu_control_state_after_jni();
2136 
2137   // Unpack native results.
2138   switch (ret_type) {
2139   case T_BOOLEAN: __ c2bool(rax);            break;
2140   case T_CHAR   : __ movzwl(rax, rax);      break;
2141   case T_BYTE   : __ sign_extend_byte (rax); break;
2142   case T_SHORT  : __ sign_extend_short(rax); break;
2143   case T_INT    : /* nothing to do */        break;
2144   case T_DOUBLE :
2145   case T_FLOAT  :
2146     // Result is in xmm0; we'll save it as needed
2147     break;
2148   case T_ARRAY:                 // Really a handle
2149   case T_OBJECT:                // Really a handle
2150       break; // can't de-handlize until after safepoint check
2151   case T_VOID: break;
2152   case T_LONG: break;
2153   default       : ShouldNotReachHere();
2154   }
2155 
2156   Label after_transition;
2157 
2158   // If this is a critical native, check for a safepoint or suspend request after the call.
2159   // If a safepoint is needed, transition to native, then to native_trans to handle
2160   // safepoints like the native methods that are not critical natives.
2161   if (is_critical_native) {
2162     Label needs_safepoint;
2163     __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */);
2164     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2165     __ jcc(Assembler::equal, after_transition);
2166     __ bind(needs_safepoint);
2167   }
2168 
2169   // Switch thread to "native transition" state before reading the synchronization state.
2170   // This additional state is necessary because reading and testing the synchronization
2171   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2172   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2173   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2174   //     Thread A is resumed to finish this native method, but doesn't block here since it
2175   //     didn't see any synchronization in progress, and escapes.
2176   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2177 
2178   // Force this write out before the read below
2179   __ membar(Assembler::Membar_mask_bits(
2180               Assembler::LoadLoad | Assembler::LoadStore |
2181               Assembler::StoreLoad | Assembler::StoreStore));
2182 
2183   // check for safepoint operation in progress and/or pending suspend requests
2184   {
2185     Label Continue;
2186     Label slow_path;
2187 
2188     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2189 
2190     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2191     __ jcc(Assembler::equal, Continue);
2192     __ bind(slow_path);
2193 
2194     // Don't use call_VM as it will see a possible pending exception and forward it
2195     // and never return here preventing us from clearing _last_native_pc down below.
2196     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2197     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2198     // by hand.
2199     //
2200     __ vzeroupper();
2201     save_native_result(masm, ret_type, stack_slots);
2202     __ mov(c_rarg0, r15_thread);
2203     __ mov(r12, rsp); // remember sp
2204     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2205     __ andptr(rsp, -16); // align stack as required by ABI
2206     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2207     __ mov(rsp, r12); // restore sp
2208     __ reinit_heapbase();
2209     // Restore any method result value
2210     restore_native_result(masm, ret_type, stack_slots);
2211     __ bind(Continue);
2212   }
2213 
2214   // change thread state
2215   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2216   __ bind(after_transition);
2217 
2218   Label reguard;
2219   Label reguard_done;
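       // If the stack guard yellow zone has been disabled (after a stack overflow),
       // re-enable it via the reguard slow path below before returning to Java.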
2220   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2221   __ jcc(Assembler::equal, reguard);
2222   __ bind(reguard_done);
2223 
2224   // native result if any is live
2225 
2226   // Unlock
2227   Label unlock_done;
2228   Label slow_path_unlock;
2229   if (method->is_synchronized()) {
2230 
2231     // Get locked oop from the handle we passed to jni
2232     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2233 
2234     Label done;
2235 
2236     if (UseBiasedLocking) {
2237       __ biased_locking_exit(obj_reg, old_hdr, done);
2238     }
2239 
2240     if (!UseFastLocking) {
2241       // Simple recursive lock?
2242 
2243       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2244       __ jcc(Assembler::equal, done);
2245     }
2246 
2247     // Must save rax if it is live now because cmpxchg must use it
2248     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2249       save_native_result(masm, ret_type, stack_slots);
2250     }
2251 
2252     if (UseFastLocking) {
2253       __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2254       __ andptr(swap_reg, ~(int32_t)markWord::lock_mask_in_place);
2255       __ fast_unlock_impl(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2256     } else {
2257       // get address of the stack lock
2258       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2259       //  get old displaced header
2260       __ movptr(old_hdr, Address(rax, 0));
2261 
2262       // Atomic swap old header if oop still contains the stack lock
2263       __ lock();
2264       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2265       __ jcc(Assembler::notEqual, slow_path_unlock);
2266     }
2267 
2268     // slow path re-enters here
2269     __ bind(unlock_done);
2270     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2271       restore_native_result(masm, ret_type, stack_slots);
2272     }
2273 
2274     __ bind(done);
2275 
2276   }
2277   {
2278     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2279     save_native_result(masm, ret_type, stack_slots);
2280     __ mov_metadata(c_rarg1, method());
2281     __ call_VM_leaf(
2282          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2283          r15_thread, c_rarg1);
2284     restore_native_result(masm, ret_type, stack_slots);
2285   }
2286 
2287   __ reset_last_Java_frame(false);
2288 
2289   // Unbox oop result, e.g. JNIHandles::resolve value.
2290   if (is_reference_type(ret_type)) {
2291     __ resolve_jobject(rax /* value */,
2292                        r15_thread /* thread */,
2293                        rcx /* tmp */);
2294   }
2295 
2296   if (CheckJNICalls) {
2297     // clear_pending_jni_exception_check
2298     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2299   }
2300 
2301   if (!is_critical_native) {
2302     // reset handle block
2303     __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2304     __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2305   }
2306 
2307   // pop our frame
2308 
2309   __ leave();
2310 
2311   if (!is_critical_native) {
2312     // Any exception pending?
2313     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2314     __ jcc(Assembler::notEqual, exception_pending);
2315   }
2316 
2317   // Return
2318 
2319   __ ret(0);
2320 
2321   // Unexpected paths are out of line and go here
2322 
2323   if (!is_critical_native) {
2324     // forward the exception
2325     __ bind(exception_pending);
2326 
2327     // and forward the exception
2328     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2329   }
2330 
2331   // Slow path locking & unlocking
2332   if (method->is_synchronized()) {
2333 
2334     // BEGIN Slow path lock
2335     __ bind(slow_path_lock);
2336 
2337     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2338     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2339 
2340     // protect the args we've loaded
2341     save_args(masm, total_c_args, c_arg, out_regs);
2342 
2343     __ mov(c_rarg0, obj_reg);
2344     __ mov(c_rarg1, lock_reg);
2345     __ mov(c_rarg2, r15_thread);
2346 
2347     // Not a leaf but we have last_Java_frame setup as we want
2348     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2349     restore_args(masm, total_c_args, c_arg, out_regs);
2350 
2351 #ifdef ASSERT
2352     { Label L;
2353     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2354     __ jcc(Assembler::equal, L);
2355     __ stop("no pending exception allowed on exit from monitorenter");
2356     __ bind(L);
2357     }
2358 #endif
2359     __ jmp(lock_done);
2360 
2361     // END Slow path lock
2362 
2363     // BEGIN Slow path unlock
2364     __ bind(slow_path_unlock);
2365 
2366     // If we haven't already saved the native result we must save it now as xmm registers
2367     // are still exposed.
2368     __ vzeroupper();
2369     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2370       save_native_result(masm, ret_type, stack_slots);
2371     }
2372 
2373     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2374 
2375     __ mov(c_rarg0, obj_reg);
2376     __ mov(c_rarg2, r15_thread);
2377     __ mov(r12, rsp); // remember sp
2378     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2379     __ andptr(rsp, -16); // align stack as required by ABI
2380 
2381     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2382     // NOTE that obj_reg == rbx currently
2383     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2384     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2385 
2386     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2387     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2388     __ mov(rsp, r12); // restore sp
2389     __ reinit_heapbase();
2390 #ifdef ASSERT
2391     {
2392       Label L;
2393       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2394       __ jcc(Assembler::equal, L);
2395       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2396       __ bind(L);
2397     }
2398 #endif /* ASSERT */
2399 
2400     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2401 
2402     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2403       restore_native_result(masm, ret_type, stack_slots);
2404     }
2405     __ jmp(unlock_done);
2406 
2407     // END Slow path unlock
2408 
2409   } // synchronized
2410 
2411   // SLOW PATH Reguard the stack if needed
2412 
2413   __ bind(reguard);
2414   __ vzeroupper();
2415   save_native_result(masm, ret_type, stack_slots);
2416   __ mov(r12, rsp); // remember sp
2417   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2418   __ andptr(rsp, -16); // align stack as required by ABI
2419   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2420   __ mov(rsp, r12); // restore sp
2421   __ reinit_heapbase();
2422   restore_native_result(masm, ret_type, stack_slots);
2423   // and continue
2424   __ jmp(reguard_done);
2425 
2426 
2427 
2428   __ flush();
2429 
2430   nmethod *nm = nmethod::new_native_nmethod(method,
2431                                             compile_id,
2432                                             masm->code(),
2433                                             vep_offset,
2434                                             frame_complete,
2435                                             stack_slots / VMRegImpl::slots_per_word,
2436                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2437                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2438                                             oop_maps);
2439 
2440   return nm;
2441 }
2442 
2443 // This function returns the adjustment size (in number of words) to a c2i adapter
2444 // activation for use during deoptimization.
2445 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
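       // For example, a callee with 1 parameter and 3 locals needs
       // (3 - 1) * Interpreter::stackElementWords extra words.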
2446   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2447 }
2448 
2449 
2450 uint SharedRuntime::out_preserve_stack_slots() {
2451   return 0;
2452 }
2453 
2454 
2455 // Number of stack slots between incoming argument block and the start of
2456 // a new frame.  The PROLOG must add this many slots to the stack.  The
2457 // EPILOG must remove this many slots.  amd64 needs two slots for the
2458 // return address and two for the saved rbp.
2459 uint SharedRuntime::in_preserve_stack_slots() {
2460   return 4 + 2 * VerifyStackAtCalls;
2461 }
2462 
2463 //------------------------------generate_deopt_blob----------------------------
2464 void SharedRuntime::generate_deopt_blob() {
2465   // Allocate space for the code
2466   ResourceMark rm;
2467   // Setup code generation tools
2468   int pad = 0;
2469   if (UseAVX > 2) {
2470     pad += 1024;
2471   }
2472 #if INCLUDE_JVMCI
2473   if (EnableJVMCI) {
2474     pad += 512; // Increase the buffer size when compiling for JVMCI
2475   }
2476 #endif
2477   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2478   MacroAssembler* masm = new MacroAssembler(&buffer);
2479   int frame_size_in_words;
2480   OopMap* map = NULL;
2481   OopMapSet *oop_maps = new OopMapSet();
2482 
2483   // -------------
2484   // This code enters when returning to a de-optimized nmethod.  A return
2485   // address has been pushed on the stack, and return values are in
2486   // registers.
2487   // If we are doing a normal deopt then we were called from the patched
2488   // nmethod from the point we returned to the nmethod. So the return
2489   // address on the stack is wrong by NativeCall::instruction_size
2490   // We will adjust the value so it looks like we have the original return
2491   // address on the stack (like when we eagerly deoptimized).
2492   // In the case of an exception pending when deoptimizing, we enter
2493   // with a return address on the stack that points after the call we patched
2494   // into the exception handler. We have the following register state from,
2495   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2496   //    rax: exception oop
2497   //    rbx: exception handler
2498   //    rdx: throwing pc
2499   // So in this case we simply jam rdx into the useless return address and
2500   // the stack looks just like we want.
2501   //
2502   // At this point we need to de-opt.  We save the argument return
2503   // registers.  We call the first C routine, fetch_unroll_info().  This
2504   // routine captures the return values and returns a structure which
2505   // describes the current frame size and the sizes of all replacement frames.
2506   // The current frame is compiled code and may contain many inlined
2507   // functions, each with their own JVM state.  We pop the current frame, then
2508   // push all the new frames.  Then we call the C routine unpack_frames() to
2509   // populate these frames.  Finally unpack_frames() returns us the new target
2510   // address.  Notice that callee-save registers are BLOWN here; they have
2511   // already been captured in the vframeArray at the time the return PC was
2512   // patched.
2513   address start = __ pc();
2514   Label cont;
2515 
2516   // Prolog for non exception case!
2517 
2518   // Save everything in sight.
2519   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2520 
2521   // Normal deoptimization.  Save exec mode for unpack_frames.
2522   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2523   __ jmp(cont);
2524 
2525   int reexecute_offset = __ pc() - start;
2526 #if INCLUDE_JVMCI && !defined(COMPILER1)
2527   if (EnableJVMCI && UseJVMCICompiler) {
2528     // JVMCI does not use this kind of deoptimization
2529     __ should_not_reach_here();
2530   }
2531 #endif
2532 
2533   // Reexecute case
2534   // return address is the pc that describes which bci to re-execute at
2535 
2536   // No need to update map as each call to save_live_registers will produce identical oopmap
2537   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2538 
2539   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2540   __ jmp(cont);
2541 
2542 #if INCLUDE_JVMCI
2543   Label after_fetch_unroll_info_call;
2544   int implicit_exception_uncommon_trap_offset = 0;
2545   int uncommon_trap_offset = 0;
2546 
2547   if (EnableJVMCI) {
2548     implicit_exception_uncommon_trap_offset = __ pc() - start;
2549 
2550     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2551     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2552 
2553     uncommon_trap_offset = __ pc() - start;
2554 
2555     // Save everything in sight.
2556     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2557     // fetch_unroll_info needs to call last_java_frame()
2558     __ set_last_Java_frame(noreg, noreg, NULL);
2559 
2560     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2561     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2562 
2563     __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2564     __ mov(c_rarg0, r15_thread);
2565     __ movl(c_rarg2, r14); // exec mode
2566     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2567     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2568 
2569     __ reset_last_Java_frame(false);
2570 
2571     __ jmp(after_fetch_unroll_info_call);
2572   } // EnableJVMCI
2573 #endif // INCLUDE_JVMCI
2574 
2575   int exception_offset = __ pc() - start;
2576 
2577   // Prolog for exception case
2578 
2579   // all registers are dead at this entry point, except for rax, and
2580   // rdx which contain the exception oop and exception pc
2581   // respectively.  Set them in TLS and fall thru to the
2582   // unpack_with_exception_in_tls entry point.
2583 
2584   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2585   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2586 
2587   int exception_in_tls_offset = __ pc() - start;
2588 
2589   // new implementation because exception oop is now passed in JavaThread
2590 
2591   // Prolog for exception case
2592   // All registers must be preserved because they might be used by LinearScan
2593   // Exception oop and throwing PC are passed in JavaThread
2594   // tos: stack at point of call to method that threw the exception (i.e. only
2595   // args are on the stack, no return address)
2596 
2597   // make room on stack for the return address
2598   // It will be patched later with the throwing pc. The correct value is not
2599   // available now because loading it from memory would destroy registers.
2600   __ push(0);
2601 
2602   // Save everything in sight.
2603   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2604 
2605   // Now it is safe to overwrite any register
2606 
2607   // Deopt during an exception.  Save exec mode for unpack_frames.
2608   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2609 
2610   // load throwing pc from JavaThread and patch it as the return address
2611   // of the current frame. Then clear the field in JavaThread
2612 
2613   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2614   __ movptr(Address(rbp, wordSize), rdx);
2615   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2616 
2617 #ifdef ASSERT
2618   // verify that there is really an exception oop in JavaThread
2619   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2620   __ verify_oop(rax);
2621 
2622   // verify that there is no pending exception
2623   Label no_pending_exception;
2624   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2625   __ testptr(rax, rax);
2626   __ jcc(Assembler::zero, no_pending_exception);
2627   __ stop("must not have pending exception here");
2628   __ bind(no_pending_exception);
2629 #endif
2630 
2631   __ bind(cont);
2632 
2633   // Call C code.  Need thread and this frame, but NOT official VM entry
2634   // crud.  We cannot block on this call, no GC can happen.
2635   //
2636   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2637 
2638   // fetch_unroll_info needs to call last_java_frame().
2639 
2640   __ set_last_Java_frame(noreg, noreg, NULL);
2641 #ifdef ASSERT
2642   { Label L;
2643     __ cmpptr(Address(r15_thread,
2644                     JavaThread::last_Java_fp_offset()),
2645             (int32_t)0);
2646     __ jcc(Assembler::equal, L);
2647     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2648     __ bind(L);
2649   }
2650 #endif // ASSERT
2651   __ mov(c_rarg0, r15_thread);
2652   __ movl(c_rarg1, r14); // exec_mode
2653   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2654 
2655   // Need to have an oopmap that tells fetch_unroll_info where to
2656   // find any register it might need.
2657   oop_maps->add_gc_map(__ pc() - start, map);
2658 
2659   __ reset_last_Java_frame(false);
2660 
2661 #if INCLUDE_JVMCI
2662   if (EnableJVMCI) {
2663     __ bind(after_fetch_unroll_info_call);
2664   }
2665 #endif
2666 
2667   // Load UnrollBlock* into rdi
2668   __ mov(rdi, rax);
2669 
2670   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2671   Label noException;
2672   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2673   __ jcc(Assembler::notEqual, noException);
2674   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2675   // QQQ this is useless; it was NULL above
2676   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2677   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2678   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2679 
2680   __ verify_oop(rax);
2681 
2682   // Overwrite the result registers with the exception results.
2683   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2684   // I think this is useless
2685   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2686 
2687   __ bind(noException);
2688 
2689   // Only register save data is on the stack.
2690   // Now restore the result registers.  Everything else is either dead
2691   // or captured in the vframeArray.
2692   RegisterSaver::restore_result_registers(masm);
2693 
2694   // All of the register save area has been popped off the stack. Only the
2695   // return address remains.
2696 
2697   // Pop all the frames we must move/replace.
2698   //
2699   // Frame picture (youngest to oldest)
2700   // 1: self-frame (no frame link)
2701   // 2: deopting frame  (no frame link)
2702   // 3: caller of deopting frame (could be compiled/interpreted).
2703   //
2704   // Note: by leaving the return address of self-frame on the stack
2705   // and using the size of frame 2 to adjust the stack
2706   // when we are done the return to frame 3 will still be on the stack.
2707 
2708   // Pop deoptimized frame
2709   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2710   __ addptr(rsp, rcx);
2711 
2712   // rsp should be pointing at the return address to the caller (3)
2713 
2714   // Pick up the initial fp we should save
2715   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2716   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2717 
2718 #ifdef ASSERT
2719   // Compilers generate code that bangs the stack by as much as the
2720   // interpreter would need. So this stack banging should never
2721   // trigger a fault. Verify that it does not on non-product builds.
2722   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2723   __ bang_stack_size(rbx, rcx);
2724 #endif
2725 
2726   // Load address of array of frame pcs into rcx
2727   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2728 
2729   // Trash the old pc
2730   __ addptr(rsp, wordSize);
2731 
2732   // Load address of array of frame sizes into rsi
2733   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2734 
2735   // Load counter into rdx
2736   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2737 
2738   // Now adjust the caller's stack to make up for the extra locals
2739   // but record the original sp so that we can save it in the skeletal interpreter
2740   // frame and the stack walking of interpreter_sender will get the unextended sp
2741   // value and not the "real" sp value.
2742 
2743   const Register sender_sp = r8;
2744 
2745   __ mov(sender_sp, rsp);
2746   __ movl(rbx, Address(rdi,
2747                        Deoptimization::UnrollBlock::
2748                        caller_adjustment_offset_in_bytes()));
2749   __ subptr(rsp, rbx);
2750 
2751   // Push interpreter frames in a loop
2752   Label loop;
2753   __ bind(loop);
2754   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2755   __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
2756   __ pushptr(Address(rcx, 0));          // Save return address
2757   __ enter();                           // Save old & set new rbp
2758   __ subptr(rsp, rbx);                  // Prolog
2759   // This value is corrected by layout_activation_impl
2760   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2761   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2762   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2763   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2764   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2765   __ decrementl(rdx);                   // Decrement counter
2766   __ jcc(Assembler::notZero, loop);
2767   __ pushptr(Address(rcx, 0));          // Save final return address
2768 
2769   // Re-push self-frame
2770   __ enter();                           // Save old & set new rbp
2771 
2772   // Allocate a full sized register save area.
2773   // Return address and rbp are in place, so we allocate two less words.
2774   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2775 
2776   // Restore frame locals after moving the frame
2777   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2778   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2779 
2780   // Call C code.  Need thread but NOT official VM entry
2781   // crud.  We cannot block on this call, no GC can happen.  Call should
2782   // restore return values to their stack-slots with the new SP.
2783   //
2784   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2785 
2786   // Use rbp because the frames look interpreted now
2787   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2788   // Don't need the precise return PC here, just precise enough to point into this code blob.
2789   address the_pc = __ pc();
2790   __ set_last_Java_frame(noreg, rbp, the_pc);
2791 
2792   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2793   __ mov(c_rarg0, r15_thread);
2794   __ movl(c_rarg1, r14); // second arg: exec_mode
2795   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2796   // Revert SP alignment after call since we're going to do some SP relative addressing below
2797   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2798 
2799   // Set an oopmap for the call site
2800   // Use the same PC we used for the last java frame
2801   oop_maps->add_gc_map(the_pc - start,
2802                        new OopMap( frame_size_in_words, 0 ));
2803 
2804   // Clear fp AND pc
2805   __ reset_last_Java_frame(true);
2806 
2807   // Collect return values
2808   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2809   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2810   // I think this is useless (throwing pc?)
2811   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2812 
2813   // Pop self-frame.
2814   __ leave();                           // Epilog
2815 
2816   // Jump to interpreter
2817   __ ret(0);
2818 
2819   // Make sure all code is generated
2820   masm->flush();
2821 
2822   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2823   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2824 #if INCLUDE_JVMCI
2825   if (EnableJVMCI) {
2826     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2827     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2828   }
2829 #endif
2830 }
2831 
2832 #ifdef COMPILER2
2833 //------------------------------generate_uncommon_trap_blob--------------------
2834 void SharedRuntime::generate_uncommon_trap_blob() {
2835   // Allocate space for the code
2836   ResourceMark rm;
2837   // Setup code generation tools
2838   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2839   MacroAssembler* masm = new MacroAssembler(&buffer);
2840 
2841   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2842 
2843   address start = __ pc();
2844 
2845   if (UseRTMLocking) {
2846     // Abort RTM transaction before possible nmethod deoptimization.
2847     __ xabort(0);
2848   }
2849 
2850   // Push self-frame.  We get here with a return address on the
2851   // stack, so rsp is 8-byte aligned until we allocate our frame.
2852   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2853 
2854   // No callee saved registers. rbp is assumed implicitly saved
2855   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2856 
2857   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2858   // runtime expects it.
2859   __ movl(c_rarg1, j_rarg0);
2860 
2861   __ set_last_Java_frame(noreg, noreg, NULL);
2862 
2863   // Call C code.  Need thread but NOT official VM entry
2864   // crud.  We cannot block on this call, no GC can happen.  Call should
2865   // capture callee-saved registers as well as return values.
2866   // Thread is in rdi already.
2867   //
2868   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2869 
2870   __ mov(c_rarg0, r15_thread);
2871   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2872   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2873 
2874   // Set an oopmap for the call site
2875   OopMapSet* oop_maps = new OopMapSet();
2876   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2877 
2878   // location of rbp is known implicitly by the frame sender code
2879 
2880   oop_maps->add_gc_map(__ pc() - start, map);
2881 
2882   __ reset_last_Java_frame(false);
2883 
2884   // Load UnrollBlock* into rdi
2885   __ mov(rdi, rax);
2886 
2887 #ifdef ASSERT
2888   { Label L;
2889     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2890             (int32_t)Deoptimization::Unpack_uncommon_trap);
2891     __ jcc(Assembler::equal, L);
2892     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2893     __ bind(L);
2894   }
2895 #endif
2896 
2897   // Pop all the frames we must move/replace.
2898   //
2899   // Frame picture (youngest to oldest)
2900   // 1: self-frame (no frame link)
2901   // 2: deopting frame  (no frame link)
2902   // 3: caller of deopting frame (could be compiled/interpreted).
2903 
2904   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2905   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2906 
2907   // Pop deoptimized frame (int)
2908   __ movl(rcx, Address(rdi,
2909                        Deoptimization::UnrollBlock::
2910                        size_of_deoptimized_frame_offset_in_bytes()));
2911   __ addptr(rsp, rcx);
2912 
2913   // rsp should be pointing at the return address to the caller (3)
2914 
2915   // Pick up the initial fp we should save
2916   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2917   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2918 
2919 #ifdef ASSERT
2920   // Compilers generate code that bangs the stack by as much as the
2921   // interpreter would need. So this stack banging should never
2922   // trigger a fault. Verify that it does not on non-product builds.
2923   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2924   __ bang_stack_size(rbx, rcx);
2925 #endif
2926 
2927   // Load address of array of frame pcs into rcx (address*)
2928   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2929 
2930   // Trash the return pc
2931   __ addptr(rsp, wordSize);
2932 
2933   // Load address of array of frame sizes into rsi (intptr_t*)
2934   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2935 
2936   // Counter
2937   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
2938 
2939   // Now adjust the caller's stack to make up for the extra locals but
2940   // record the original sp so that we can save it in the skeletal
2941   // interpreter frame and the stack walking of interpreter_sender
2942   // will get the unextended sp value and not the "real" sp value.
2943 
2944   const Register sender_sp = r8;
2945 
2946   __ mov(sender_sp, rsp);
2947   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
2948   __ subptr(rsp, rbx);
2949 
2950   // Push interpreter frames in a loop
2951   Label loop;
2952   __ bind(loop);
2953   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2954   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2955   __ pushptr(Address(rcx, 0));     // Save return address
2956   __ enter();                      // Save old & set new rbp
2957   __ subptr(rsp, rbx);             // Prolog
2958   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2959             sender_sp);            // Make it walkable
2960   // This value is corrected by layout_activation_impl
2961   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2962   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
2963   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
2964   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
2965   __ decrementl(rdx);              // Decrement counter
2966   __ jcc(Assembler::notZero, loop);
2967   __ pushptr(Address(rcx, 0));     // Save final return address
2968 
2969   // Re-push self-frame
2970   __ enter();                 // Save old & set new rbp
2971   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2972                               // Prolog
2973 
2974   // Use rbp because the frames look interpreted now
2975   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2976   // Don't need the precise return PC here, just precise enough to point into this code blob.
2977   address the_pc = __ pc();
2978   __ set_last_Java_frame(noreg, rbp, the_pc);
2979 
2980   // Call C code.  Need thread but NOT official VM entry
2981   // crud.  We cannot block on this call, no GC can happen.  Call should
2982   // restore return values to their stack-slots with the new SP.
2983   // Thread is in rdi already.
2984   //
2985   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2986 
2987   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2988   __ mov(c_rarg0, r15_thread);
2989   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2990   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2991 
2992   // Set an oopmap for the call site
2993   // Use the same PC we used for the last java frame
2994   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2995 
2996   // Clear fp AND pc
2997   __ reset_last_Java_frame(true);
2998 
2999   // Pop self-frame.
3000   __ leave();                 // Epilog
3001 
3002   // Jump to interpreter
3003   __ ret(0);
3004 
3005   // Make sure all code is generated
3006   masm->flush();
3007 
3008   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3009                                                  SimpleRuntimeFrame::framesize >> 1);
3010 }
3011 #endif // COMPILER2
3012 
3013 //------------------------------generate_handler_blob------
3014 //
3015 // Generate a special Compile2Runtime blob that saves all registers,
3016 // and sets up an oopmap.
3017 //
3018 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3019   assert(StubRoutines::forward_exception_entry() != NULL,
3020          "must be generated before");
3021 
3022   ResourceMark rm;
3023   OopMapSet *oop_maps = new OopMapSet();
3024   OopMap* map;
3025 
3026   // Allocate space for the code.  Setup code generation tools.
3027   CodeBuffer buffer("handler_blob", 2048, 1024);
3028   MacroAssembler* masm = new MacroAssembler(&buffer);
3029 
3030   address start   = __ pc();
3031   address call_pc = NULL;
3032   int frame_size_in_words;
3033   bool cause_return = (poll_type == POLL_AT_RETURN);
3034   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3035 
3036   if (UseRTMLocking) {
3037     // Abort RTM transaction before calling runtime
3038     // because critical section will be large and will be
3039     // aborted anyway. Also nmethod could be deoptimized.
3040     __ xabort(0);
3041   }
3042 
3043   // Make room for return address (or push it again)
3044   if (!cause_return) {
3045     __ push(rbx);
3046   }
3047 
3048   // Save registers, fpu state, and flags
3049   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3050 
3051   // The following is basically a call_VM.  However, we need the precise
3052   // address of the call in order to generate an oopmap. Hence, we do all the
3053   // work ourselves.
3054 
3055   __ set_last_Java_frame(noreg, noreg, NULL);
3056 
3057   // The return address must always be correct so that frame constructor never
3058   // sees an invalid pc.
3059 
3060   if (!cause_return) {
3061     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3062     // Additionally, rbx is a callee saved register and we can look at it later to determine
3063     // if someone changed the return address for us!
3064     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3065     __ movptr(Address(rbp, wordSize), rbx);
3066   }
3067 
3068   // Do the call
3069   __ mov(c_rarg0, r15_thread);
3070   __ call(RuntimeAddress(call_ptr));
3071 
3072   // Set an oopmap for the call site.  This oopmap will map all
3073   // oop-registers and debug-info registers as callee-saved.  This
3074   // will allow deoptimization at this safepoint to find all possible
3075   // debug-info recordings, as well as let GC find all oops.
3076 
3077   oop_maps->add_gc_map( __ pc() - start, map);
3078 
3079   Label noException;
3080 
3081   __ reset_last_Java_frame(false);
3082 
3083   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3084   __ jcc(Assembler::equal, noException);
3085 
3086   // Exception pending
3087 
3088   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3089 
3090   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3091 
3092   // No exception case
3093   __ bind(noException);
3094 
3095   Label no_adjust;
3096 #ifdef ASSERT
3097   Label bail;
3098 #endif
3099   if (!cause_return) {
3100     Label no_prefix, not_special;
3101 
3102     // If our stashed return pc was modified by the runtime we avoid touching it
3103     __ cmpptr(rbx, Address(rbp, wordSize));
3104     __ jccb(Assembler::notEqual, no_adjust);
3105 
3106     // Skip over the poll instruction.
3107     // See NativeInstruction::is_safepoint_poll()
3108     // Possible encodings:
3109     //      85 00       test   %eax,(%rax)
3110     //      85 01       test   %eax,(%rcx)
3111     //      85 02       test   %eax,(%rdx)
3112     //      85 03       test   %eax,(%rbx)
3113     //      85 06       test   %eax,(%rsi)
3114     //      85 07       test   %eax,(%rdi)
3115     //
3116     //   41 85 00       test   %eax,(%r8)
3117     //   41 85 01       test   %eax,(%r9)
3118     //   41 85 02       test   %eax,(%r10)
3119     //   41 85 03       test   %eax,(%r11)
3120     //   41 85 06       test   %eax,(%r14)
3121     //   41 85 07       test   %eax,(%r15)
3122     //
3123     //      85 04 24    test   %eax,(%rsp)
3124     //   41 85 04 24    test   %eax,(%r12)
3125     //      85 45 00    test   %eax,0x0(%rbp)
3126     //   41 85 45 00    test   %eax,0x0(%r13)
3127 
3128     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3129     __ jcc(Assembler::notEqual, no_prefix);
3130     __ addptr(rbx, 1);
3131     __ bind(no_prefix);
3132 #ifdef ASSERT
3133     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3134 #endif
3135     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3136     // r12/rsp 0x04
3137     // r13/rbp 0x05
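         // If the low three bits (the modrm r/m field) at rbx+1 are 0x04 (SIB
         // byte follows, rsp/r12 base) or 0x05 (rbp/r13 base with a disp8),
         // the poll instruction is one byte longer, so skip one extra byte.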
3138     __ movzbq(rcx, Address(rbx, 1));
3139     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3140     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3141     __ cmpptr(rcx, 1);
3142     __ jcc(Assembler::above, not_special);
3143     __ addptr(rbx, 1);
3144     __ bind(not_special);
3145 #ifdef ASSERT
3146     // Verify the correct encoding of the poll we're about to skip.
3147     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3148     __ jcc(Assembler::notEqual, bail);
3149     // Mask out the modrm bits
3150     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3151     // rax encodes to 0, so if the bits are nonzero it's incorrect
3152     __ jcc(Assembler::notZero, bail);
3153 #endif
3154     // Adjust return pc forward to step over the safepoint poll instruction
3155     __ addptr(rbx, 2);
3156     __ movptr(Address(rbp, wordSize), rbx);
3157   }
3158 
3159   __ bind(no_adjust);
3160   // Normal exit, restore registers and exit.
3161   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3162   __ ret(0);
3163 
3164 #ifdef ASSERT
3165   __ bind(bail);
3166   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3167 #endif
3168 
3169   // Make sure all code is generated
3170   masm->flush();
3171 
3172   // Fill-out other meta info
3173   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3174 }
3175 
3176 //
3177 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3178 //
3179 // Generate a stub that calls into vm to find out the proper destination
3180 // of a Java call. All the argument registers are live at this point,
3181 // but since this is generic code we don't know what they are, and the caller
3182 // must do any GC of the args.
3183 //
3184 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3185   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3186 
3187   // allocate space for the code
3188   ResourceMark rm;
3189 
3190   CodeBuffer buffer(name, 1000, 512);
3191   MacroAssembler* masm                = new MacroAssembler(&buffer);
3192 
3193   int frame_size_in_words;
3194 
3195   OopMapSet *oop_maps = new OopMapSet();
3196   OopMap* map = NULL;
3197 
3198   int start = __ offset();
3199 
3200   // No need to save vector registers since they are caller-saved anyway.
3201   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3202 
3203   int frame_complete = __ offset();
3204 
3205   __ set_last_Java_frame(noreg, noreg, NULL);
3206 
3207   __ mov(c_rarg0, r15_thread);
3208 
3209   __ call(RuntimeAddress(destination));
3210 
3211 
3212   // Set an oopmap for the call site.
3213   // We need this not only for callee-saved registers, but also for volatile
3214   // registers that the compiler might be keeping live across a safepoint.
3215 
3216   oop_maps->add_gc_map( __ offset() - start, map);
3217 
3218   // rax contains the address we are going to jump to, assuming no exception got installed
3219 
3220   // clear last_Java_sp
3221   __ reset_last_Java_frame(false);
3222   // check for pending exceptions
3223   Label pending;
3224   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3225   __ jcc(Assembler::notEqual, pending);
3226 
3227   // get the returned Method*
3228   __ get_vm_result_2(rbx, r15_thread);
3229   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3230 
3231   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3232 
3233   RegisterSaver::restore_live_registers(masm);
3234 
3235   // We are back to the original state on entry and ready to go.
3236 
3237   __ jmp(rax);
3238 
3239   // Pending exception after the safepoint
3240 
3241   __ bind(pending);
3242 
3243   RegisterSaver::restore_live_registers(masm);
3244 
3245   // exception pending => remove activation and forward to exception handler
3246 
3247   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3248 
3249   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3250   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3251 
3252   // -------------
3253   // make sure all code is generated
3254   masm->flush();
3255 
3256   // return the blob
3257   // frame_size_words or bytes??
3258   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3259 }
3260 
3261 #ifdef COMPILER2
3262 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3263 
3264 class NativeInvokerGenerator : public StubCodeGenerator {
3265   address _call_target;
3266   int _shadow_space_bytes;
3267 
3268   const GrowableArray<VMReg>& _input_registers;
3269   const GrowableArray<VMReg>& _output_registers;
3270 
3271   int _frame_complete;
3272   int _framesize;
3273   OopMapSet* _oop_maps;
3274 public:
3275   NativeInvokerGenerator(CodeBuffer* buffer,
3276                          address call_target,
3277                          int shadow_space_bytes,
3278                          const GrowableArray<VMReg>& input_registers,
3279                          const GrowableArray<VMReg>& output_registers)
3280    : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3281      _call_target(call_target),
3282      _shadow_space_bytes(shadow_space_bytes),
3283      _input_registers(input_registers),
3284      _output_registers(output_registers),
3285      _frame_complete(0),
3286      _framesize(0),
3287      _oop_maps(NULL) {
3288     assert(_output_registers.length() <= 1
3289            || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3290 
3291   }
3292 
3293   void generate();
3294 
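       // Bytes of stack reserved to preserve the single output register across
       // the slow-path runtime calls below; for XMM results the full vector
       // width is reserved (16/32/64 bytes for SSE/AVX/AVX-512).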
3295   int spill_size_in_bytes() const {
3296     if (_output_registers.length() == 0) {
3297       return 0;
3298     }
3299     VMReg reg = _output_registers.at(0);
3300     assert(reg->is_reg(), "must be a register");
3301     if (reg->is_Register()) {
3302       return 8;
3303     } else if (reg->is_XMMRegister()) {
3304       if (UseAVX >= 3) {
3305         return 64;
3306       } else if (UseAVX >= 1) {
3307         return 32;
3308       } else {
3309         return 16;
3310       }
3311     } else {
3312       ShouldNotReachHere();
3313     }
3314     return 0;
3315   }
3316 
3317   void spill_out_registers() {
3318     if (_output_registers.length() == 0) {
3319       return;
3320     }
3321     VMReg reg = _output_registers.at(0);
3322     assert(reg->is_reg(), "must be a register");
3323     MacroAssembler* masm = _masm;
3324     if (reg->is_Register()) {
3325       __ movptr(Address(rsp, 0), reg->as_Register());
3326     } else if (reg->is_XMMRegister()) {
3327       if (UseAVX >= 3) {
3328         __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3329       } else if (UseAVX >= 1) {
3330         __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3331       } else {
3332         __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3333       }
3334     } else {
3335       ShouldNotReachHere();
3336     }
3337   }
3338 
3339   void fill_out_registers() {
3340     if (_output_registers.length() == 0) {
3341       return;
3342     }
3343     VMReg reg = _output_registers.at(0);
3344     assert(reg->is_reg(), "must be a register");
3345     MacroAssembler* masm = _masm;
3346     if (reg->is_Register()) {
3347       __ movptr(reg->as_Register(), Address(rsp, 0));
3348     } else if (reg->is_XMMRegister()) {
3349       if (UseAVX >= 3) {
3350         __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3351       } else if (UseAVX >= 1) {
3352         __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3353       } else {
3354         __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3355       }
3356     } else {
3357       ShouldNotReachHere();
3358     }
3359   }
3360 
3361   int frame_complete() const {
3362     return _frame_complete;
3363   }
3364 
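       // _framesize is kept in 32-bit stack slots; convert it to 64-bit words
       // (two slots per word) for the RuntimeStub frame size.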
3365   int framesize() const {
3366     return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3367   }
3368 
3369   OopMapSet* oop_maps() const {
3370     return _oop_maps;
3371   }
3372 
3373 private:
3374 #ifdef ASSERT
3375 bool target_uses_register(VMReg reg) {
3376   return _input_registers.contains(reg) || _output_registers.contains(reg);
3377 }
3378 #endif
3379 };
3380 
3381 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3382                                                 int shadow_space_bytes,
3383                                                 const GrowableArray<VMReg>& input_registers,
3384                                                 const GrowableArray<VMReg>& output_registers) {
3385   int locs_size  = 64;
3386   CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3387   NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3388   g.generate();
3389   code.log_section_sizes("nep_invoker_blob");
3390 
3391   RuntimeStub* stub =
3392     RuntimeStub::new_runtime_stub("nep_invoker_blob",
3393                                   &code,
3394                                   g.frame_complete(),
3395                                   g.framesize(),
3396                                   g.oop_maps(), false);
3397   return stub;
3398 }
3399 
3400 void NativeInvokerGenerator::generate() {
3401   assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3402 
3403   enum layout {
3404     rbp_off,
3405     rbp_off2,
3406     return_off,
3407     return_off2,
3408     framesize // inclusive of return address
3409   };
3410 
3411   _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3412   assert(is_even(_framesize/2), "sp not 16-byte aligned");
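       // For illustration (an assumed configuration): with no shadow space and
       // a plain integer result (8-byte spill), this is align_up(4 + 8/4, 4) = 8
       // slots, i.e. a 32-byte frame that keeps sp 16-byte aligned.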
3413 
3414   _oop_maps  = new OopMapSet();
3415   MacroAssembler* masm = _masm;
3416 
3417   address start = __ pc();
3418 
3419   __ enter();
3420 
3421   // return address and rbp are already in place
3422   __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3423 
3424   _frame_complete = __ pc() - start;
3425 
3426   address the_pc = __ pc();
3427 
3428   __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3429   OopMap* map = new OopMap(_framesize, 0);
3430   _oop_maps->add_gc_map(the_pc - start, map);
3431 
3432   // State transition
3433   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3434 
3435   __ call(RuntimeAddress(_call_target));
3436 
3437   __ restore_cpu_control_state_after_jni();
3438 
3439   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3440 
3441   // Force this write out before the read below
3442   __ membar(Assembler::Membar_mask_bits(
3443           Assembler::LoadLoad | Assembler::LoadStore |
3444           Assembler::StoreLoad | Assembler::StoreStore));
3445 
3446   Label L_after_safepoint_poll;
3447   Label L_safepoint_poll_slow_path;
3448 
3449   __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3450   __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3451   __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3452 
3453   __ bind(L_after_safepoint_poll);
3454 
3455   // change thread state
3456   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3457 
3458   __ block_comment("reguard stack check");
3459   Label L_reguard;
3460   Label L_after_reguard;
3461   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3462   __ jcc(Assembler::equal, L_reguard);
3463   __ bind(L_after_reguard);
3464 
3465   __ reset_last_Java_frame(r15_thread, true);
3466 
3467   __ leave(); // required for proper stackwalking of RuntimeStub frame
3468   __ ret(0);
3469 
3470   //////////////////////////////////////////////////////////////////////////////
3471 
3472   __ block_comment("{ L_safepoint_poll_slow_path");
3473   __ bind(L_safepoint_poll_slow_path);
3474   __ vzeroupper();
3475 
3476   spill_out_registers();
3477 
3478   __ mov(c_rarg0, r15_thread);
3479   __ mov(r12, rsp); // remember sp
3480   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3481   __ andptr(rsp, -16); // align stack as required by ABI
3482   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3483   __ mov(rsp, r12); // restore sp
3484   __ reinit_heapbase();
3485 
3486   fill_out_registers();
3487 
3488   __ jmp(L_after_safepoint_poll);
3489   __ block_comment("} L_safepoint_poll_slow_path");
3490 
3491   //////////////////////////////////////////////////////////////////////////////
3492 
3493   __ block_comment("{ L_reguard");
3494   __ bind(L_reguard);
3495   __ vzeroupper();
3496 
3497   spill_out_registers();
3498 
3499   __ mov(r12, rsp); // remember sp
3500   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3501   __ andptr(rsp, -16); // align stack as required by ABI
3502   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3503   __ mov(rsp, r12); // restore sp
3504   __ reinit_heapbase();
3505 
3506   fill_out_registers();
3507 
3508   __ jmp(L_after_reguard);
3509 
3510   __ block_comment("} L_reguard");
3511 
3512   //////////////////////////////////////////////////////////////////////////////
3513 
3514   __ flush();
3515 }
3516 #endif // COMPILER2
3517 
3518 //------------------------------Montgomery multiplication------------------------
3519 //
3520 
3521 #ifndef _WINDOWS
3522 
3523 // Subtract 0:b from carry:a.  Return carry.
3524 static julong
3525 sub(julong a[], julong b[], julong carry, long len) {
3526   long long i = 0, cnt = len;
3527   julong tmp;
3528   asm volatile("clc; "
3529                "0: ; "
3530                "mov (%[b], %[i], 8), %[tmp]; "
3531                "sbb %[tmp], (%[a], %[i], 8); "
3532                "inc %[i]; dec %[cnt]; "
3533                "jne 0b; "
3534                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3535                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3536                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3537                : "memory");
3538   return tmp;
3539 }
3540 
3541 // Multiply (unsigned) Long A by Long B, accumulating the double-
3542 // length result into the accumulator formed of T0, T1, and T2.
3543 #define MACC(A, B, T0, T1, T2)                                  \
3544 do {                                                            \
3545   unsigned long hi, lo;                                         \
3546   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3547            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3548            : "r"(A), "a"(B) : "cc");                            \
3549  } while(0)
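     // Note: the "a"(B) constraint places B in rax; the one-operand mul leaves
     // the 128-bit product A*B in rdx:rax, which add/adc/adc then folds into
     // the (T2:T1:T0) accumulator.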
3550 
3551 // As above, but add twice the double-length result into the
3552 // accumulator.
3553 #define MACC2(A, B, T0, T1, T2)                                 \
3554 do {                                                            \
3555   unsigned long hi, lo;                                         \
3556   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3557            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3558            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3559            : "r"(A), "a"(B) : "cc");                            \
3560  } while(0)
3561 
3562 #else //_WINDOWS
3563 
3564 static julong
3565 sub(julong a[], julong b[], julong carry, long len) {
3566   long i;
3567   julong tmp;
3568   unsigned char c = 1;
3569   for (i = 0; i < len; i++) {
3570     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3571     a[i] = tmp;
3572   }
3573   c = _addcarry_u64(c, carry, ~0, &tmp);
3574   return tmp;
3575 }
3576 
3577 // Multiply (unsigned) Long A by Long B, accumulating the double-
3578 // length result into the accumulator formed of T0, T1, and T2.
3579 #define MACC(A, B, T0, T1, T2)                          \
3580 do {                                                    \
3581   julong hi, lo;                            \
3582   lo = _umul128(A, B, &hi);                             \
3583   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3584   c = _addcarry_u64(c, hi, T1, &T1);                    \
3585   _addcarry_u64(c, T2, 0, &T2);                         \
3586  } while(0)
3587 
3588 // As above, but add twice the double-length result into the
3589 // accumulator.
3590 #define MACC2(A, B, T0, T1, T2)                         \
3591 do {                                                    \
3592   julong hi, lo;                            \
3593   lo = _umul128(A, B, &hi);                             \
3594   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3595   c = _addcarry_u64(c, hi, T1, &T1);                    \
3596   _addcarry_u64(c, T2, 0, &T2);                         \
3597   c = _addcarry_u64(0, lo, T0, &T0);                    \
3598   c = _addcarry_u64(c, hi, T1, &T1);                    \
3599   _addcarry_u64(c, T2, 0, &T2);                         \
3600  } while(0)
3601 
3602 #endif //_WINDOWS
3603 
3604 // Fast Montgomery multiplication.  The derivation of the algorithm is
3605 // in  A Cryptographic Library for the Motorola DSP56000,
3606 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
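     //
     // Sketch of what the word-serial loops below compute, assuming the usual
     // Montgomery setup (R = 2^(64*len), inv == -n[0]^-1 mod 2^64): each outer
     // iteration accumulates one 64-bit column of a*b + m*n into the
     // triple-precision accumulator (t2:t1:t0), chooses m[i] = t0 * inv so the
     // column's low word cancels to zero, and shifts the accumulator down one
     // word; the surviving high words give m == a*b*R^-1 (mod n), with the
     // trailing sub() loop handling the final reduction below n.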
3607 
3608 static void NOINLINE
3609 montgomery_multiply(julong a[], julong b[], julong n[],
3610                     julong m[], julong inv, int len) {
3611   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3612   int i;
3613 
3614   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3615 
3616   for (i = 0; i < len; i++) {
3617     int j;
3618     for (j = 0; j < i; j++) {
3619       MACC(a[j], b[i-j], t0, t1, t2);
3620       MACC(m[j], n[i-j], t0, t1, t2);
3621     }
3622     MACC(a[i], b[0], t0, t1, t2);
3623     m[i] = t0 * inv;
3624     MACC(m[i], n[0], t0, t1, t2);
3625 
3626     assert(t0 == 0, "broken Montgomery multiply");
3627 
3628     t0 = t1; t1 = t2; t2 = 0;
3629   }
3630 
3631   for (i = len; i < 2*len; i++) {
3632     int j;
3633     for (j = i-len+1; j < len; j++) {
3634       MACC(a[j], b[i-j], t0, t1, t2);
3635       MACC(m[j], n[i-j], t0, t1, t2);
3636     }
3637     m[i-len] = t0;
3638     t0 = t1; t1 = t2; t2 = 0;
3639   }
3640 
3641   while (t0)
3642     t0 = sub(m, n, t0, len);
3643 }
3644 
3645 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3646 // multiplies so it should be up to 25% faster than Montgomery
3647 // multiplication.  However, its loop control is more complex and it
3648 // may actually run slower on some machines.
3649 
3650 static void NOINLINE
3651 montgomery_square(julong a[], julong n[],
3652                   julong m[], julong inv, int len) {
3653   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3654   int i;
3655 
3656   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3657 
3658   for (i = 0; i < len; i++) {
3659     int j;
3660     int end = (i+1)/2;
3661     for (j = 0; j < end; j++) {
3662       MACC2(a[j], a[i-j], t0, t1, t2);
3663       MACC(m[j], n[i-j], t0, t1, t2);
3664     }
3665     if ((i & 1) == 0) {
3666       MACC(a[j], a[j], t0, t1, t2);
3667     }
3668     for (; j < i; j++) {
3669       MACC(m[j], n[i-j], t0, t1, t2);
3670     }
3671     m[i] = t0 * inv;
3672     MACC(m[i], n[0], t0, t1, t2);
3673 
3674     assert(t0 == 0, "broken Montgomery square");
3675 
3676     t0 = t1; t1 = t2; t2 = 0;
3677   }
3678 
3679   for (i = len; i < 2*len; i++) {
3680     int start = i-len+1;
3681     int end = start + (len - start)/2;
3682     int j;
3683     for (j = start; j < end; j++) {
3684       MACC2(a[j], a[i-j], t0, t1, t2);
3685       MACC(m[j], n[i-j], t0, t1, t2);
3686     }
3687     if ((i & 1) == 0) {
3688       MACC(a[j], a[j], t0, t1, t2);
3689     }
3690     for (; j < len; j++) {
3691       MACC(m[j], n[i-j], t0, t1, t2);
3692     }
3693     m[i-len] = t0;
3694     t0 = t1; t1 = t2; t2 = 0;
3695   }
3696 
3697   while (t0)
3698     t0 = sub(m, n, t0, len);
3699 }
3700 
3701 // Swap words in a longword.
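     // e.g. swap(0x0000000100000002ULL) == 0x0000000200000001ULL.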
3702 static julong swap(julong x) {
3703   return (x << 32) | (x >> 32);
3704 }
3705 
3706 // Copy len longwords from s to d, word-swapping as we go.  The
3707 // destination array is reversed.
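     // e.g. for len == 2: d[1] = swap(s[0]) and d[0] = swap(s[1]).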
3708 static void reverse_words(julong *s, julong *d, int len) {
3709   d += len;
3710   while(len-- > 0) {
3711     d--;
3712     *d = swap(*s);
3713     s++;
3714   }
3715 }
3716 
3717 // The threshold at which squaring is advantageous was determined
3718 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3719 #define MONTGOMERY_SQUARING_THRESHOLD 64
3720 
3721 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3722                                         jint len, jlong inv,
3723                                         jint *m_ints) {
3724   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3725   int longwords = len/2;
3726 
3727   // Make very sure we don't use so much space that the stack might
3728   // overflow.  512 jints corresponds to a 16384-bit integer and
3729   // will use here a total of 8k bytes of stack space.
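       // (4 scratch arrays of 256 julongs each: 4 * 256 * 8 = 8192 bytes.)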
3730   int total_allocation = longwords * sizeof (julong) * 4;
3731   guarantee(total_allocation <= 8192, "must be");
3732   julong *scratch = (julong *)alloca(total_allocation);
3733 
3734   // Local scratch arrays
3735   julong
3736     *a = scratch + 0 * longwords,
3737     *b = scratch + 1 * longwords,
3738     *n = scratch + 2 * longwords,
3739     *m = scratch + 3 * longwords;
3740 
3741   reverse_words((julong *)a_ints, a, longwords);
3742   reverse_words((julong *)b_ints, b, longwords);
3743   reverse_words((julong *)n_ints, n, longwords);
3744 
3745   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3746 
3747   reverse_words(m, (julong *)m_ints, longwords);
3748 }
3749 
3750 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3751                                       jint len, jlong inv,
3752                                       jint *m_ints) {
3753   assert(len % 2 == 0, "array length in montgomery_square must be even");
3754   int longwords = len/2;
3755 
3756   // Make very sure we don't use so much space that the stack might
3757   // overflow.  512 jints corresponds to a 16384-bit integer and
3758   // will use here a total of 6k bytes of stack space.
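       // (3 scratch arrays of 256 julongs each: 3 * 256 * 8 = 6144 bytes.)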
3759   int total_allocation = longwords * sizeof (julong) * 3;
3760   guarantee(total_allocation <= 8192, "must be");
3761   julong *scratch = (julong *)alloca(total_allocation);
3762 
3763   // Local scratch arrays
3764   julong
3765     *a = scratch + 0 * longwords,
3766     *n = scratch + 1 * longwords,
3767     *m = scratch + 2 * longwords;
3768 
3769   reverse_words((julong *)a_ints, a, longwords);
3770   reverse_words((julong *)n_ints, n, longwords);
3771 
3772   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3773     ::montgomery_square(a, n, m, (julong)inv, longwords);
3774   } else {
3775     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3776   }
3777 
3778   reverse_words(m, (julong *)m_ints, longwords);
3779 }
3780 
3781 #ifdef COMPILER2
3782 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3783 //
3784 //------------------------------generate_exception_blob---------------------------
3785 // Creates the exception blob at the end.
3786 // This code is jumped to from a compiled method via the exception blob
3787 // (see emit_exception_handler in the x86_64.ad file).
3788 //
3789 // Given an exception pc at a call, we call into the runtime for the
3790 // handler in this method. This handler might merely restore state
3791 // (i.e. callee-save registers), unwind the frame, and jump to the
3792 // exception handler for the nmethod if there is no Java-level handler
3793 // for the nmethod.
3794 //
3795 // This code is entered with a jmp.
3796 //
3797 // Arguments:
3798 //   rax: exception oop
3799 //   rdx: exception pc
3800 //
3801 // Results:
3802 //   rax: exception oop
3803 //   rdx: exception pc in caller or ???
3804 //   destination: exception handler of caller
3805 //
3806 // Note: the exception pc MUST be at a call (precise debug information)
3807 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3808 //
3809 
3810 void OptoRuntime::generate_exception_blob() {
3811   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3812   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3813   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3814 
3815   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3816 
3817   // Allocate space for the code
3818   ResourceMark rm;
3819   // Setup code generation tools
3820   CodeBuffer buffer("exception_blob", 2048, 1024);
3821   MacroAssembler* masm = new MacroAssembler(&buffer);
3822 
3823 
3824   address start = __ pc();
3825 
3826   // Exception pc is 'return address' for stack walker
3827   __ push(rdx);
3828   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3829 
3830   // Save callee-saved registers.  See x86_64.ad.
3831 
3832   // rbp is an implicitly saved callee saved register (i.e., the calling
3833   // convention will save/restore it in the prolog/epilog). Other than that
3834   // there are no callee save registers now that adapter frames are gone.
3835 
3836   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3837 
3838   // Store exception in Thread object. We cannot pass any arguments to the
3839   // handle_exception call, since we do not want to make any assumption
3840   // about the size of the frame where the exception happened in.
3841   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3842   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3843   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3844 
3845   // This call does all the hard work.  It checks if an exception handler
3846   // exists in the method.
3847   // If so, it returns the handler address.
3848   // If not, it prepares for stack-unwinding, restoring the callee-save
3849   // registers of the frame being removed.
3850   //
3851   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3852 
3853   // At a method handle call, the stack may not be properly aligned
3854   // when returning with an exception.
3855   address the_pc = __ pc();
3856   __ set_last_Java_frame(noreg, noreg, the_pc);
3857   __ mov(c_rarg0, r15_thread);
3858   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3859   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3860 
3861   // Set an oopmap for the call site.  This oopmap will only be used if we
3862   // are unwinding the stack.  Hence, all locations will be dead.
3863   // Callee-saved registers will be the same as the frame above (i.e.,
3864   // handle_exception_stub), since they were restored when we got the
3865   // exception.
3866 
3867   OopMapSet* oop_maps = new OopMapSet();
3868 
3869   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3870 
3871   __ reset_last_Java_frame(false);
3872 
3873   // Restore callee-saved registers
3874 
3875   // rbp is an implicitly saved callee-saved register (i.e., the calling
3876   // convention will save/restore it in the prolog/epilog). Other than that
3877   // there are no callee save registers now that adapter frames are gone.
3878 
3879   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3880 
3881   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3882   __ pop(rdx);                  // No need for exception pc anymore
3883 
3884   // rax: exception handler
3885 
3886   // We have a handler in rax (could be deopt blob).
3887   __ mov(r8, rax);
3888 
3889   // Get the exception oop
3890   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3891   // Get the exception pc in case we are deoptimized
3892   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3893 #ifdef ASSERT
3894   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3895   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3896 #endif
3897   // Clear the exception oop so GC no longer processes it as a root.
3898   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3899 
3900   // rax: exception oop
3901   // r8:  exception handler
3902   // rdx: exception pc
3903   // Jump to handler
3904 
3905   __ jmp(r8);
3906 
3907   // Make sure all code is generated
3908   masm->flush();
3909 
3910   // Set exception blob
3911   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3912 }
3913 #endif // COMPILER2
3914 
3915 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
3916                                        int total_in_args, const VMRegPair* in_regs,
3917                                        int total_out_args, VMRegPair* out_regs,
3918                                        GrowableArray<int>& arg_order,
3919                                        VMRegPair tmp_vmreg) {
3920   ComputeMoveOrder order(total_in_args, in_regs,
3921                          total_out_args, out_regs,
3922                          in_sig_bt, arg_order, tmp_vmreg);
3923 }