/*
 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/nativeInst.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shared/gcLocker.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/compiledICHolder.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vframeArray.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/formatBuffer.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif

#define __ masm->

const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;

class SimpleRuntimeFrame {

 public:

  // Most of the runtime stubs have this simple frame layout.
  // This class exists to make the layout shared in one place.
  // Offsets are for compiler stack slots, which are jints.
  enum layout {
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off, return_off2,
    framesize
  };
};
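
// Illustrative picture of the SimpleRuntimeFrame layout above (slots are jints,
// offsets grow toward higher addresses):
//
//   [arg register save area]   <- rsp, frame::arg_reg_save_area_bytes
//   [saved rbp             ]   <- rbp_off / rbp_off2
//   [return address        ]   <- return_off / return_off2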

class RegisterSaver {
  // Capture info about frame layout. Layout offsets are in jint
  // units because compiler frame slots are jints.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off    = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // 2..15 are implied in range usage
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off    = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

 public:
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};
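
// Illustrative use of the RegisterSaver API (a sketch of the pattern used by
// runtime stubs, not a definitive recipe):
//
//   int frame_size_in_words;
//   OopMap* map = RegisterSaver::save_live_registers(masm, 0 /* additional_frame_words */,
//                                                    &frame_size_in_words, false /* save_wide_vectors */);
//   // ... emit the VM call and record `map` in the stub's OopMapSet ...
//   RegisterSaver::restore_live_registers(masm);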

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
#if COMPILER2_OR_JVMCI
  if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
  }
  assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
#else
  save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
#endif

  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ push_CPU_state(); // Push a multiple of 16 bytes

  // push_CPU_state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }
  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site. This oopmap will map all
  // oop-registers and debug-info registers as callee-saved. This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap,
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets it is included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers (0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers (0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets it is included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
  if (UseAVX < 3) {
    num_xmm_regs = num_xmm_regs/2;
  }
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

#if COMPILER2_OR_JVMCI
  if (restore_wide_vectors) {
    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
  }
#else
  assert(!restore_wide_vectors, "vectors are generated only by C2");
#endif

  __ vzeroupper();

  // On EVEX enabled targets everything is handled by pop_CPU_state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers (16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers (16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

  // Recover CPU state
  __ pop_CPU_state();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}

void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore the result registers. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}

// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
  return size > 16;
}
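
// For example, a 32-byte YMM or 64-byte ZMM vector counts as "wide" and needs the
// extra save/restore paths above, while a 16-byte XMM vector is already covered by
// the default fxsave/fxrstor area.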

// ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go. Values in the VMRegPair regs array refer to 4-byte
// quantities. Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
// as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher. Register
// values (0 up to RegisterImpl::number_of_registers) are the 64-bit
// integer registers.

// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static jni methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
// advantage out of it.

int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
                                           VMRegPair *regs,
                                           int total_args_passed) {

  // Create the mapping between argument positions and
  // registers.
  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
    j_farg0, j_farg1, j_farg2, j_farg3,
    j_farg4, j_farg5, j_farg6, j_farg7
  };


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID:
      // halves of T_LONG or T_DOUBLE
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
      if (int_args < Argument::n_int_register_parameters_j) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_j) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }

  return align_up(stk_args, 2);
}
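
// Illustrative example: for the Java signature (int, long, Object, float, double)
// the loop above assigns
//   int    -> j_rarg0     long   -> j_rarg1 (its T_VOID half is set_bad)
//   Object -> j_rarg2     float  -> j_farg0
//   double -> j_farg1 (its T_VOID half is set_bad)
// and only falls back to stack slots once the register pools are exhausted.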

// Patch the caller's callsite with entry to compiled code if it exists.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture the return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);
  __ mov(c_rarg1, rax);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}


static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all. We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one). Check for a
  // compiled target. If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need. Plus one word because
  // we also account for the return address location since
  // we store it first rather than hold it in rax across all the shuffling.

  int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;

  // stack is aligned, keep it that way
  extraspace = align_up(extraspace, 2*wordSize);

  // Get return address
  __ pop(rax);

  // set senderSP value
  __ mov(r13, rsp);

  __ subptr(rsp, extraspace);

  // Store the return address in the expected location
  __ movptr(Address(rsp, 0), rax);

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    // i   st_off
    // 0   32   T_LONG
    // 1   24   T_VOID
    // 2   16   T_OBJECT
    // 3    8   T_BOOL
    // -    0   return address
    //
    // However, to make things extra confusing: because we can fit a long/double
    // in a single slot on a 64-bit VM and it would be silly to break them up,
    // the interpreter leaves one slot empty and only stores to a single slot.
    // In this case the slot that is occupied is the T_VOID slot. (See, I said
    // it was confusing.)

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less), so move only 32 bits to the slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float, use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}

static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
                        address code_start, address code_end,
                        Label& L_ok) {
  Label L_fail;
  __ lea(temp_reg, ExternalAddress(code_start));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::belowEqual, L_fail);
  __ lea(temp_reg, ExternalAddress(code_end));
  __ cmpptr(pc_reg, temp_reg);
  __ jcc(Assembler::below, L_ok);
  __ bind(L_fail);
}
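
// Note: range_check jumps to L_ok only when pc_reg lies strictly inside
// (code_start, code_end); for any other value it falls through at L_fail so the
// caller can emit its own failure handling (see its use in gen_i2c_adapter below).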

void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args, as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose the alignment we expect in all compiled code and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Pick up the return address
  __ movptr(rax, Address(rsp, 0));

  if (VerifyAdapterCalls &&
      (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
    // So, let's test for cascading c2i/i2c adapters right now.
    //  assert(Interpreter::contains($return_addr) ||
    //         StubRoutines::contains($return_addr),
    //         "i2c adapter must return to an interpreter frame");
    __ block_comment("verify_i2c { ");
    Label L_ok;
    if (Interpreter::code() != NULL)
      range_check(masm, rax, r11,
                  Interpreter::code()->code_start(), Interpreter::code()->code_end(),
                  L_ok);
    if (StubRoutines::code1() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
                  L_ok);
    if (StubRoutines::code2() != NULL)
      range_check(masm, rax, r11,
                  StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
                  L_ok);
    const char* msg = "i2c adapter must return to an interpreter frame";
    __ block_comment(msg);
    __ stop(msg);
    __ bind(L_ok);
    __ block_comment("} verify_i2c ");
  }

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Cut-out for having no stack args. Since up to 6 int/oop args are passed
  // in registers, we will occasionally have no stack args.
  int comp_words_on_stack = 0;
  if (comp_args_on_stack) {
    // Sig words on the stack are greater-than VMRegImpl::stack0. Those in
    // registers are below. By subtracting stack0, we either get a negative
    // number (all values in registers) or the maximum stack slot accessed.

    // Convert 4-byte c2 stack slots to words.
    comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
    // Round up to minimum stack alignment, in wordSize
    comp_words_on_stack = align_up(comp_words_on_stack, 2);
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }


  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // Push the return address; this misaligns the stack so that the youngest
  // frame sees it exactly as it would just after the placement of a call
  // instruction.
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code. Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;
    //
    //
    //
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss, a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed with negative offsets so the LSW is at the LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect it should we end up there;
  // only needed because the c2 resolve stubs return the Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}

// ---------------------------------------------------------------
AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                                            int total_args_passed,
                                                            int comp_args_on_stack,
                                                            const BasicType *sig_bt,
                                                            const VMRegPair *regs,
                                                            AdapterFingerPrint* fingerprint) {
  address i2c_entry = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
  // to the interpreter. The args start out packed in the compiled layout. They
  // need to be unpacked into the interpreter layout. This will almost always
  // require some stack space. We grow the current (compiled) stack, then repack
  // the args. We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  address c2i_unverified_entry = __ pc();
  Label skip_fixup;
  Label ok;

  Register holder = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    __ load_klass(temp, receiver, rscratch1);
    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
    __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
    __ jcc(Assembler::equal, ok);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

    __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  address c2i_entry = __ pc();

  // Class initialization barrier for static methods
  address c2i_no_clinit_check_entry = NULL;
  if (VM_Version::supports_fast_class_init_checks()) {
    Label L_skip_barrier;
    Register method = rbx;

    { // Bypass the barrier for non-static methods
      Register flags = rscratch1;
      __ movl(flags, Address(method, Method::access_flags_offset()));
      __ testl(flags, JVM_ACC_STATIC);
      __ jcc(Assembler::zero, L_skip_barrier); // non-static
    }

    Register klass = rscratch1;
    __ load_method_holder(klass, method);
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
    c2i_no_clinit_check_entry = __ pc();
  }

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);

  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
}
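
// The AdapterHandlerEntry returned above bundles the code addresses generated
// here: i2c_entry (interpreted caller -> compiled callee), c2i_entry (compiled
// caller -> interpreted callee), c2i_unverified_entry (same, but with the
// inline-cache klass check first) and, when fast class-init checks are
// supported, c2i_no_clinit_check_entry, which bypasses the static-initializer
// barrier.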

int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                        VMRegPair *regs,
                                        VMRegPair *regs2,
                                        int total_args_passed) {
  assert(regs2 == NULL, "not needed on x86");
  // We return the amount of VMRegImpl stack slots we need to reserve for all
  // the arguments NOT counting out_preserve_stack_slots.

  // NOTE: These arrays will have to change when c1 is ported
#ifdef _WIN64
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3
  };
#else
  static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
    c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
  };
  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
    c_farg0, c_farg1, c_farg2, c_farg3,
    c_farg4, c_farg5, c_farg6, c_farg7
  };
#endif // _WIN64


  uint int_args = 0;
  uint fp_args = 0;
  uint stk_args = 0; // inc by 2 each time

  for (int i = 0; i < total_args_passed; i++) {
    switch (sig_bt[i]) {
    case T_BOOLEAN:
    case T_CHAR:
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_LONG:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      // fall through
    case T_OBJECT:
    case T_ARRAY:
    case T_ADDRESS:
    case T_METADATA:
      if (int_args < Argument::n_int_register_parameters_c) {
        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
#ifdef _WIN64
        fp_args++;
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_FLOAT:
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set1(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_DOUBLE:
      assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
      if (fp_args < Argument::n_float_register_parameters_c) {
        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
#ifdef _WIN64
        int_args++;
        // Allocate slots for the callee to stuff register args on the stack.
        stk_args += 2;
#endif
      } else {
        regs[i].set2(VMRegImpl::stack2reg(stk_args));
        stk_args += 2;
      }
      break;
    case T_VOID: // Halves of longs and doubles
      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
      regs[i].set_bad();
      break;
    default:
      ShouldNotReachHere();
      break;
    }
  }
#ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for four 64-bit registers to be stored down.
  if (stk_args < 8) {
    stk_args = 8;
  }
#endif // _WIN64

  return stk_args;
}
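
// Illustrative example: for the C signature (int, double, jlong) the loop above
// assigns
//   SysV (Linux/BSD/macOS):  int -> c_rarg0, double -> c_farg0, jlong -> c_rarg1
//   Win64:                   int -> c_rarg0, double -> c_farg1, jlong -> c_rarg2
// because on Win64 integer and floating-point arguments share one positional
// sequence, which is why the int/fp counters bump each other above.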

int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                             uint num_bits,
                                             uint total_args_passed) {
  assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
         "only certain vector sizes are supported for now");

  static const XMMRegister VEC_ArgReg[32] = {
     xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
     xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
    xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
  };

  uint stk_args = 0;
  uint fp_args = 0;

  for (uint i = 0; i < total_args_passed; i++) {
    VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
    int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
    regs[i].set_pair(vmreg->next(next_val), vmreg);
  }

  return stk_args;
}
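
// Illustrative example: with num_bits == 256, argument i is passed in the full
// 256-bit register i, expressed as the VMReg pair (xmm_i, xmm_i->next(7)); no
// stack slots are ever used, hence the returned stk_args of 0.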

void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(Address(rbp, -wordSize), xmm0);
    break;
  case T_DOUBLE:
    __ movdbl(Address(rbp, -wordSize), xmm0);
    break;
  case T_VOID:  break;
  default: {
    __ movptr(Address(rbp, -wordSize), rax);
  }
  }
}

void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
  switch (ret_type) {
  case T_FLOAT:
    __ movflt(xmm0, Address(rbp, -wordSize));
    break;
  case T_DOUBLE:
    __ movdbl(xmm0, Address(rbp, -wordSize));
    break;
  case T_VOID:  break;
  default: {
    __ movptr(rax, Address(rbp, -wordSize));
  }
  }
}

static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = first_arg ; i < arg_count ; i++ ) {
    if (args[i].first()->is_Register()) {
      __ push(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ subptr(rsp, 2*wordSize);
      __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
    }
  }
}

static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
    if (args[i].first()->is_Register()) {
      __ pop(args[i].first()->as_Register());
    } else if (args[i].first()->is_XMMRegister()) {
      __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
      __ addptr(rsp, 2*wordSize);
    }
  }
}

// Unpack an array argument into a pointer to the body and the length
// if the array is non-null, otherwise pass 0 for both.
static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
  Register tmp_reg = rax;
  assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
         "possible collision");
  assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
         "possible collision");

  __ block_comment("unpack_array_argument {");

  // Pass the length, ptr pair
  Label is_null, done;
  VMRegPair tmp;
  tmp.set_ptr(tmp_reg->as_VMReg());
  if (reg.first()->is_stack()) {
    // Load the arg up from the stack
    __ move_ptr(reg, tmp);
    reg = tmp;
  }
  __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
  __ jccb(Assembler::equal, is_null);
  __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move_ptr(tmp, body_arg);
  // load the length relative to the body.
  __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
                           arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  __ move32_64(tmp, length_arg);
  __ jmpb(done);
  __ bind(is_null);
  // Pass zeros
  __ xorptr(tmp_reg, tmp_reg);
  __ move_ptr(tmp, body_arg);
  __ move32_64(tmp, length_arg);
  __ bind(done);

  __ block_comment("} unpack_array_argument");
}
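
// For example, for a non-null jint[] argument, body_arg receives the address of
// the first element (base_offset_in_bytes(T_INT) past the oop) and length_arg
// receives the jint length; for a null array both are passed as zero.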


// Different signatures may require very different orders for the move
// to avoid clobbering other arguments. There's no simple way to
// order them safely. Compute a safe order for issuing stores and
// break any cycles in those stores. This code is fairly general but
// it's not necessary on the other platforms so we keep it in the
// platform dependent code instead of moving it into a shared file.
// (See bugs 7013347 & 7145024.)
// Note that this code is specific to LP64.
class ComputeMoveOrder: public StackObj {
  class MoveOperation: public ResourceObj {
    friend class ComputeMoveOrder;
   private:
    VMRegPair _src;
    VMRegPair _dst;
    int _src_index;
    int _dst_index;
    bool _processed;
    MoveOperation* _next;
    MoveOperation* _prev;

    static int get_id(VMRegPair r) {
      return r.first()->value();
    }

   public:
    MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
      _src(src)
    , _dst(dst)
    , _src_index(src_index)
    , _dst_index(dst_index)
    , _processed(false)
    , _next(NULL)
    , _prev(NULL) {
    }

    VMRegPair src() const              { return _src; }
    int src_id() const                 { return get_id(src()); }
    int src_index() const              { return _src_index; }
    VMRegPair dst() const              { return _dst; }
    void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
    int dst_index() const              { return _dst_index; }
    int dst_id() const                 { return get_id(dst()); }
    MoveOperation* next() const        { return _next; }
    MoveOperation* prev() const        { return _prev; }
    void set_processed()               { _processed = true; }
    bool is_processed() const          { return _processed; }

    // insert
    void break_cycle(VMRegPair temp_register) {
      // create a new store following the last store
      // to move from the temp_register to the original
      MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());

      // break the cycle of links and insert new_store at the end
      // break the reverse link.
      MoveOperation* p = prev();
      assert(p->next() == this, "must be");
      _prev = NULL;
      p->_next = new_store;
      new_store->_prev = p;

      // change the original store to save its value in the temp.
      set_dst(-1, temp_register);
    }

    void link(GrowableArray<MoveOperation*>& killer) {
      // link this store in front of the store that it depends on
      MoveOperation* n = killer.at_grow(src_id(), NULL);
      if (n != NULL) {
        assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
        _next = n;
        n->_prev = this;
      }
    }
  };

 private:
  GrowableArray<MoveOperation*> edges;

 public:
  ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
    // Move operations where the dest is the stack can all be
    // scheduled first since they can't interfere with the other moves.
    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
      if (in_sig_bt[i] == T_ARRAY) {
        c_arg--;
        if (out_regs[c_arg].first()->is_stack() &&
            out_regs[c_arg + 1].first()->is_stack()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          if (out_regs[c_arg].first()->is_stack() ||
              in_regs[i].first() == out_regs[c_arg].first()) {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
          } else {
            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
          }
        }
      } else if (in_sig_bt[i] == T_VOID) {
        arg_order.push(i);
        arg_order.push(c_arg);
      } else {
        if (out_regs[c_arg].first()->is_stack() ||
            in_regs[i].first() == out_regs[c_arg].first()) {
          arg_order.push(i);
          arg_order.push(c_arg);
        } else {
          add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
        }
      }
    }
    // Break any cycles in the register moves and emit them in the
    // proper order.
    GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
    for (int i = 0; i < stores->length(); i++) {
      arg_order.push(stores->at(i)->src_index());
      arg_order.push(stores->at(i)->dst_index());
    }
  }

  // Collect all the move operations
  void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
    if (src.first() == dst.first()) return;
    edges.append(new MoveOperation(src_index, src, dst_index, dst));
  }

  // Walk the edges breaking cycles between moves. The result list
  // can be walked in order to produce the proper set of loads
  GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
    // Record which moves kill which values
    GrowableArray<MoveOperation*> killer;
    for (int i = 0; i < edges.length(); i++) {
      MoveOperation* s = edges.at(i);
      assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
      killer.at_put_grow(s->dst_id(), s, NULL);
    }
    assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
           "make sure temp isn't in the registers that are killed");

    // create links between loads and stores
    for (int i = 0; i < edges.length(); i++) {
      edges.at(i)->link(killer);
    }

    // at this point, all the move operations are chained together
    // in a doubly linked list. Processing it backwards finds
    // the beginning of the chain, forwards finds the end. If there's
    // a cycle it can be broken at any point, so pick an edge and walk
    // backward until the list ends or we end where we started.
    GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
    for (int e = 0; e < edges.length(); e++) {
      MoveOperation* s = edges.at(e);
      if (!s->is_processed()) {
        MoveOperation* start = s;
        // search for the beginning of the chain or cycle
        while (start->prev() != NULL && start->prev() != s) {
          start = start->prev();
        }
        if (start->prev() == s) {
          start->break_cycle(temp_register);
        }
        // walk the chain forward inserting to store list
        while (start != NULL) {
          stores->append(start);
          start->set_processed();
          start = start->next();
        }
      }
    }
    return stores;
  }
};
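
// Illustrative example of the cycle breaking above: if the shuffle needs the two
// moves rdi -> rsi and rsi -> rdi, they form a cycle, so one of them is rewritten
// to store into tmp_vmreg first and a new trailing move from tmp_vmreg to the
// original destination is appended (see MoveOperation::break_cycle()).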
1447
1448 static void verify_oop_args(MacroAssembler* masm,
1449 const methodHandle& method,
1450 const BasicType* sig_bt,
1451 const VMRegPair* regs) {
1452 Register temp_reg = rbx; // not part of any compiled calling seq
1453 if (VerifyOops) {
1454 for (int i = 0; i < method->size_of_parameters(); i++) {
1455 if (is_reference_type(sig_bt[i])) {
1456 VMReg r = regs[i].first();
1457 assert(r->is_valid(), "bad oop arg");
1458 if (r->is_stack()) {
1459 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1460 __ verify_oop(temp_reg);
1461 } else {
1462 __ verify_oop(r->as_Register());
1463 }
1464 }
1465 }
1466 }
1467 }
1468
1469 static void gen_special_dispatch(MacroAssembler* masm,
1470 const methodHandle& method,
1471 const BasicType* sig_bt,
1472 const VMRegPair* regs) {
1473 verify_oop_args(masm, method, sig_bt, regs);
1474 vmIntrinsics::ID iid = method->intrinsic_id();
1475
1476 // Now write the args into the outgoing interpreter space
1477 bool has_receiver = false;
1478 Register receiver_reg = noreg;
1479 int member_arg_pos = -1;
1480 Register member_reg = noreg;
1481 int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1482 if (ref_kind != 0) {
1483 member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
1484 member_reg = rbx; // known to be free at this point
1485 has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1486 } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1487 has_receiver = true;
1488 } else {
1489 fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1490 }
1491
1492 if (member_reg != noreg) {
1493 // Load the member_arg into register, if necessary.
1494 SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1495 VMReg r = regs[member_arg_pos].first();
1496 if (r->is_stack()) {
1497 __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1498 } else {
1499 // no data motion is needed
1500 member_reg = r->as_Register();
1501 }
1502 }
1503
1504 if (has_receiver) {
1505 // Make sure the receiver is loaded into a register.
1506 assert(method->size_of_parameters() > 0, "oob");
1507 assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1508 VMReg r = regs[0].first();
1509 assert(r->is_valid(), "bad receiver arg");
1510 if (r->is_stack()) {
1511 // Porting note: This assumes that compiled calling conventions always
1512 // pass the receiver oop in a register. If this is not true on some
1513 // platform, pick a temp and load the receiver from stack.
1514 fatal("receiver always in a register");
1515 receiver_reg = j_rarg0; // known to be free at this point
1516 __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1517 } else {
1518 // no data motion is needed
1519 receiver_reg = r->as_Register();
1520 }
1521 }
1522
1523 // Figure out which address we are really jumping to:
1524 MethodHandles::generate_method_handle_dispatch(masm, iid,
1525 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1526 }
1527
1528 // ---------------------------------------------------------------------------
1529 // Generate a native wrapper for a given method. The method takes arguments
1530 // in the Java compiled code convention, marshals them to the native
1531 // convention (handlizes oops, etc), transitions to native, makes the call,
1532 // returns to java state (possibly blocking), unhandlizes any result and
1533 // returns.
1534 //
1535 // Critical native functions are a shorthand for the use of
1536 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1537 // functions. The wrapper is expected to unpack the arguments before
1538 // passing them to the callee. Critical native functions leave the state _in_Java,
1539 // since they cannot stop for GC.
1540 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1541 // block and the check for pending exceptions, since it is impossible for them
1542 // to be thrown.
1543 //
1544 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1545 const methodHandle& method,
1546 int compile_id,
1547 BasicType* in_sig_bt,
1548 VMRegPair* in_regs,
1549 BasicType ret_type,
1550 address critical_entry) {
1551 if (method->is_method_handle_intrinsic()) {
1552 vmIntrinsics::ID iid = method->intrinsic_id();
1553 intptr_t start = (intptr_t)__ pc();
1554 int vep_offset = ((intptr_t)__ pc()) - start;
1555 gen_special_dispatch(masm,
1556 method,
1557 in_sig_bt,
1558 in_regs);
1559 int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
1560 __ flush();
1561 int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
1562 return nmethod::new_native_nmethod(method,
1563 compile_id,
1564 masm->code(),
1565 vep_offset,
1566 frame_complete,
1567 stack_slots / VMRegImpl::slots_per_word,
1568 in_ByteSize(-1),
1569 in_ByteSize(-1),
1570 (OopMapSet*)NULL);
1571 }
1572 bool is_critical_native = true;
1573 address native_func = critical_entry;
1574 if (native_func == NULL) {
1575 native_func = method->native_function();
1576 is_critical_native = false;
1577 }
1578 assert(native_func != NULL, "must have function");
1579
1580 // An OopMap for lock (and class if static)
1581 OopMapSet *oop_maps = new OopMapSet();
1582 intptr_t start = (intptr_t)__ pc();
1583
1584 // We have received a description of where all the java args are located
1585 // on entry to the wrapper. We need to convert these args to where
1586 // the jni function will expect them. To figure out where they go
1587 // we convert the java signature to a C signature by inserting
1588 // the hidden arguments as arg[0] and possibly arg[1] (static method)
1589
1590 const int total_in_args = method->size_of_parameters();
1591 int total_c_args = total_in_args;
1592 if (!is_critical_native) {
1593 total_c_args += 1;
1594 if (method->is_static()) {
1595 total_c_args++;
1596 }
1597 } else {
1598 for (int i = 0; i < total_in_args; i++) {
1599 if (in_sig_bt[i] == T_ARRAY) {
1600 total_c_args++;
1601 }
1602 }
1603 }
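// Illustrative counts: a static native method taking a single int has
// total_in_args == 1 and total_c_args == 3 (JNIEnv*, class mirror, int),
// while for a critical native each T_ARRAY parameter adds one extra C
// argument because it is expanded into a (length, address) pair below.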
1604
1605 BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1606 VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1607 BasicType* in_elem_bt = NULL;
1608
1609 int argc = 0;
1610 if (!is_critical_native) {
1611 out_sig_bt[argc++] = T_ADDRESS;
1612 if (method->is_static()) {
1613 out_sig_bt[argc++] = T_OBJECT;
1614 }
1615
1616 for (int i = 0; i < total_in_args ; i++ ) {
1617 out_sig_bt[argc++] = in_sig_bt[i];
1618 }
1619 } else {
1620 in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1621 SignatureStream ss(method->signature());
1622 for (int i = 0; i < total_in_args ; i++ ) {
1623 if (in_sig_bt[i] == T_ARRAY) {
1624 // Arrays are passed as int, elem* pair
1625 out_sig_bt[argc++] = T_INT;
1626 out_sig_bt[argc++] = T_ADDRESS;
1627 ss.skip_array_prefix(1); // skip one '['
1628 assert(ss.is_primitive(), "primitive type expected");
1629 in_elem_bt[i] = ss.type();
1630 } else {
1631 out_sig_bt[argc++] = in_sig_bt[i];
1632 in_elem_bt[i] = T_VOID;
1633 }
1634 if (in_sig_bt[i] != T_VOID) {
1635 assert(in_sig_bt[i] == ss.type() ||
1636 in_sig_bt[i] == T_ARRAY, "must match");
1637 ss.next();
1638 }
1639 }
1640 }
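// For example (illustrative), a critical-native byte[] parameter appears in
// out_sig_bt as the pair (T_INT length, T_ADDRESS body), and in_elem_bt
// records T_BYTE so the array can be unpacked with the right element type.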
1641
1642 // Now figure out where the args must be stored and how much stack space
1643 // they require.
1644 int out_arg_slots;
1645 out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1646
1647 // Compute framesize for the wrapper. We need to handlize all oops in
1648 // incoming registers
1649
1650 // Calculate the total number of stack slots we will need.
1651
1652 // First count the abi requirement plus all of the outgoing args
1653 int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1654
1655 // Now the space for the inbound oop handle area
1656 int total_save_slots = 6 * VMRegImpl::slots_per_word; // 6 arguments passed in registers
1657 if (is_critical_native) {
1658 // Critical natives may have to call out so they need a save area
1659 // for register arguments.
1660 int double_slots = 0;
1661 int single_slots = 0;
1662 for ( int i = 0; i < total_in_args; i++) {
1663 if (in_regs[i].first()->is_Register()) {
1664 const Register reg = in_regs[i].first()->as_Register();
1665 switch (in_sig_bt[i]) {
1666 case T_BOOLEAN:
1667 case T_BYTE:
1668 case T_SHORT:
1669 case T_CHAR:
1670 case T_INT: single_slots++; break;
1671 case T_ARRAY: // specific to LP64 (7145024)
1672 case T_LONG: double_slots++; break;
1673 default: ShouldNotReachHere();
1674 }
1675 } else if (in_regs[i].first()->is_XMMRegister()) {
1676 switch (in_sig_bt[i]) {
1677 case T_FLOAT: single_slots++; break;
1678 case T_DOUBLE: double_slots++; break;
1679 default: ShouldNotReachHere();
1680 }
1681 } else if (in_regs[i].first()->is_FloatRegister()) {
1682 ShouldNotReachHere();
1683 }
1684 }
1685 total_save_slots = double_slots * 2 + single_slots;
1686 // align the save area
1687 if (double_slots != 0) {
1688 stack_slots = align_up(stack_slots, 2);
1689 }
1690 }
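// Illustrative sizing: a critical native receiving two ints and one double in
// registers gets single_slots == 2 and double_slots == 1, so
// total_save_slots == 4 and stack_slots is rounded up to an even slot count.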
1691
1692 int oop_handle_offset = stack_slots;
1693 stack_slots += total_save_slots;
1694
1695 // Now any space we need for handlizing a klass if static method
1696
1697 int klass_slot_offset = 0;
1698 int klass_offset = -1;
1699 int lock_slot_offset = 0;
1700 bool is_static = false;
1701
1702 if (method->is_static()) {
1703 klass_slot_offset = stack_slots;
1704 stack_slots += VMRegImpl::slots_per_word;
1705 klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1706 is_static = true;
1707 }
1708
1709 // Plus a lock if needed
1710
1711 if (method->is_synchronized()) {
1712 lock_slot_offset = stack_slots;
1713 stack_slots += VMRegImpl::slots_per_word;
1714 }
1715
1716 // Now a place (+2) to save return values or temp during shuffling
1717 // + 4 for return address (which we own) and saved rbp
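// (That is 2 slots of spill area plus 2 slots for the return address and 2 for
// the saved rbp: 24 bytes in all with 4-byte stack slots.)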
1718 stack_slots += 6;
1719
1720 // Ok The space we have allocated will look like:
1721 //
1722 //
1723 // FP-> | |
1724 // |---------------------|
1725 // | 2 slots for moves |
1726 // |---------------------|
1727 // | lock box (if sync) |
1728 // |---------------------| <- lock_slot_offset
1729 // | klass (if static) |
1730 // |---------------------| <- klass_slot_offset
1731 // | oopHandle area |
1732 // |---------------------| <- oop_handle_offset (6 java arg registers)
1733 // | outbound memory |
1734 // | based arguments |
1735 // | |
1736 // |---------------------|
1737 // | |
1738 // SP-> | out_preserved_slots |
1739 //
1740 //
1741
1742
1743 // Now compute the actual number of stack words we need, rounding to keep the
1744 // stack properly aligned.
1745 stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1746
1747 int stack_size = stack_slots * VMRegImpl::stack_slot_size;
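// With 4-byte VMReg stack slots and the 16-byte stack alignment required by
// the amd64 ABI, the round-up above makes stack_size a multiple of 16 bytes.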
1748
1749 // First thing make an ic check to see if we should even be here
1750
1751 // We are free to use all registers as temps without saving them and
1752 // restoring them except rbp. rbp is the only callee save register
1753 // as far as the interpreter and the compiler(s) are concerned.
1754
1755
1756 const Register ic_reg = rax;
1757 const Register receiver = j_rarg0;
1758
1759 Label hit;
1760 Label exception_pending;
1761
1762 assert_different_registers(ic_reg, receiver, rscratch1);
1763 __ verify_oop(receiver);
1764 __ load_klass(rscratch1, receiver, rscratch2);
1765 __ cmpq(ic_reg, rscratch1);
1766 __ jcc(Assembler::equal, hit);
1767
1768 __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1769
1770 // Verified entry point must be aligned
1771 __ align(8);
1772
1773 __ bind(hit);
1774
1775 int vep_offset = ((intptr_t)__ pc()) - start;
1776
1777 if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1778 Label L_skip_barrier;
1779 Register klass = r10;
1780 __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1781 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1782
1783 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1784
1785 __ bind(L_skip_barrier);
1786 }
1787
1788 #ifdef COMPILER1
1789 // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1790 if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1791 inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1792 }
1793 #endif // COMPILER1
1794
1795 // The instruction at the verified entry point must be 5 bytes or longer
1796 // because it can be patched on the fly by make_not_entrant. The stack bang
1797 // instruction fits that requirement.
1798
1799 // Generate stack overflow check
1800 __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1801
1802 // Generate a new frame for the wrapper.
1803 __ enter();
1804 // -2 because return address is already present and so is saved rbp
1805 __ subptr(rsp, stack_size - 2*wordSize);
1806
1807 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1808 bs->nmethod_entry_barrier(masm);
1809
1810 // Frame is now completed as far as size and linkage.
1811 int frame_complete = ((intptr_t)__ pc()) - start;
1812
1813 if (UseRTMLocking) {
1814 // Abort RTM transaction before calling JNI
1815 // because critical section will be large and will be
1816 // aborted anyway. Also nmethod could be deoptimized.
1817 __ xabort(0);
1818 }
1819
1820 #ifdef ASSERT
1821 {
1822 Label L;
1823 __ mov(rax, rsp);
1824 __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1825 __ cmpptr(rax, rsp);
1826 __ jcc(Assembler::equal, L);
1827 __ stop("improperly aligned stack");
1828 __ bind(L);
1829 }
1830 #endif /* ASSERT */
1831
1832
1833 // We use r14 as the oop handle for the receiver/klass
1834 // It is callee save so it survives the call to native
1835
1836 const Register oop_handle_reg = r14;
1837
1838 //
1839 // We immediately shuffle the arguments so that for any vm call we have to
1840 // make from here on out (sync slow path, jvmti, etc.) we will have
1841 // captured the oops from our caller and have a valid oopMap for
1842 // them.
1843
1844 // -----------------
1845 // The Grand Shuffle
1846
1847 // The Java calling convention is either equal (linux) or denser (win64) than the
1848 // c calling convention. However, because of the jni_env argument, the c calling
1849 // convention always has at least one more argument (and two more for static methods) than Java.
1850 // Therefore if we move the args from java -> c backwards then we will never have
1851 // a register->register conflict and we don't have to build a dependency graph
1852 // and figure out how to break any cycles.
1853 //
1854
1855 // Record esp-based slot for receiver on stack for non-static methods
1856 int receiver_offset = -1;
1857
1858 // This is a trick. We double the stack slots so we can claim
1859 // the oops in the caller's frame. Since we are sure to have
1860 // more args than the caller, doubling is enough to make
1861 // sure we can capture all the incoming oop args from the
1862 // caller.
1863 //
1864 OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1865
1866 // Mark location of rbp (someday)
1867 // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1868
1869 // Use eax, ebx as temporaries during any memory-memory moves we have to do
1870 // All inbound args are referenced based on rbp and all outbound args via rsp.
1871
1872
1873 #ifdef ASSERT
1874 bool reg_destroyed[RegisterImpl::number_of_registers];
1875 bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1876 for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1877 reg_destroyed[r] = false;
1878 }
1879 for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1880 freg_destroyed[f] = false;
1881 }
1882
1883 #endif /* ASSERT */
1884
1885 // This may iterate in two different directions depending on the
1886 // kind of native it is. The reason is that for regular JNI natives
1887 // the incoming and outgoing registers are offset upwards and for
1888 // critical natives they are offset down.
1889 GrowableArray<int> arg_order(2 * total_in_args);
1890
1891 VMRegPair tmp_vmreg;
1892 tmp_vmreg.set2(rbx->as_VMReg());
1893
1894 if (!is_critical_native) {
1895 for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1896 arg_order.push(i);
1897 arg_order.push(c_arg);
1898 }
1899 } else {
1900 // Compute a valid move order, using tmp_vmreg to break any cycles
1901 ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
1902 }
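// Illustrative contents of arg_order for a regular native with two Java args
// and three C args: the pairs (1, 2) and (0, 1). The hidden JNIEnv* slot
// (c_arg 0) is not shuffled here; it is loaded into c_rarg0 just before the call.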
1903
1904 int temploc = -1;
1905 for (int ai = 0; ai < arg_order.length(); ai += 2) {
1906 int i = arg_order.at(ai);
1907 int c_arg = arg_order.at(ai + 1);
1908 __ block_comment(err_msg("move %d -> %d", i, c_arg));
1909 if (c_arg == -1) {
1910 assert(is_critical_native, "should only be required for critical natives");
1911 // This arg needs to be moved to a temporary
1912 __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
1913 in_regs[i] = tmp_vmreg;
1914 temploc = i;
1915 continue;
1916 } else if (i == -1) {
1917 assert(is_critical_native, "should only be required for critical natives");
1918 // Read from the temporary location
1919 assert(temploc != -1, "must be valid");
1920 i = temploc;
1921 temploc = -1;
1922 }
1923 #ifdef ASSERT
1924 if (in_regs[i].first()->is_Register()) {
1925 assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1926 } else if (in_regs[i].first()->is_XMMRegister()) {
1927 assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1928 }
1929 if (out_regs[c_arg].first()->is_Register()) {
1930 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1931 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1932 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1933 }
1934 #endif /* ASSERT */
1935 switch (in_sig_bt[i]) {
1936 case T_ARRAY:
1937 if (is_critical_native) {
1938 unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
1939 c_arg++;
1940 #ifdef ASSERT
1941 if (out_regs[c_arg].first()->is_Register()) {
1942 reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1943 } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1944 freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1945 }
1946 #endif
1947 break;
1948 }
1949 case T_OBJECT:
1950 assert(!is_critical_native, "no oop arguments");
1951 __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1952 ((i == 0) && (!is_static)),
1953 &receiver_offset);
1954 break;
1955 case T_VOID:
1956 break;
1957
1958 case T_FLOAT:
1959 __ float_move(in_regs[i], out_regs[c_arg]);
1960 break;
1961
1962 case T_DOUBLE:
1963 assert( i + 1 < total_in_args &&
1964 in_sig_bt[i + 1] == T_VOID &&
1965 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1966 __ double_move(in_regs[i], out_regs[c_arg]);
1967 break;
1968
1969 case T_LONG :
1970 __ long_move(in_regs[i], out_regs[c_arg]);
1971 break;
1972
1973 case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1974
1975 default:
1976 __ move32_64(in_regs[i], out_regs[c_arg]);
1977 }
1978 }
1979
1980 int c_arg;
1981
1982 // Pre-load a static method's oop into r14. Used both by locking code and
1983 // the normal JNI call code.
1984 if (!is_critical_native) {
1985 // point c_arg at the first arg that is already loaded in case we
1986 // need to spill before we call out
1987 c_arg = total_c_args - total_in_args;
1988
1989 if (method->is_static()) {
1990
1991 // load oop into a register
1992 __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1993
1994 // Now handlize the static class mirror; it's known not-null.
1995 __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1996 map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1997
1998 // Now get the handle
1999 __ lea(oop_handle_reg, Address(rsp, klass_offset));
2000 // store the klass handle as second argument
2001 __ movptr(c_rarg1, oop_handle_reg);
2002 // and protect the arg if we must spill
2003 c_arg--;
2004 }
2005 } else {
2006 // For JNI critical methods we need to save all registers in save_args.
2007 c_arg = 0;
2008 }
2009
2010 // Change state to native (we save the return address in the thread, since it might not
2011 // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2012 // points into the right code segment. It does not have to be the correct return pc.
2013 // We use the same pc/oopMap repeatedly when we call out
2014
2015 intptr_t the_pc = (intptr_t) __ pc();
2016 oop_maps->add_gc_map(the_pc - start, map);
2017
2018 __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2019
2020
2021 // We have all of the arguments set up at this point. We must not touch any of the
2022 // argument registers from here on (if we had to save/restore them, there would be no oopMap for them).
2023
2024 {
2025 SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2026 // protect the args we've loaded
2027 save_args(masm, total_c_args, c_arg, out_regs);
2028 __ mov_metadata(c_rarg1, method());
2029 __ call_VM_leaf(
2030 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2031 r15_thread, c_rarg1);
2032 restore_args(masm, total_c_args, c_arg, out_regs);
2033 }
2034
2035 // RedefineClasses() tracing support for obsolete method entry
2036 if (log_is_enabled(Trace, redefine, class, obsolete)) {
2037 // protect the args we've loaded
2038 save_args(masm, total_c_args, c_arg, out_regs);
2039 __ mov_metadata(c_rarg1, method());
2040 __ call_VM_leaf(
2041 CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2042 r15_thread, c_rarg1);
2043 restore_args(masm, total_c_args, c_arg, out_regs);
2044 }
2045
2046 // Lock a synchronized method
2047
2048 // Register definitions used by locking and unlocking
2049
2050 const Register swap_reg = rax; // Must use rax for cmpxchg instruction
2051 const Register obj_reg = rbx; // Will contain the oop
2052 const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
2053 const Register old_hdr = r13; // value of old header at unlock time
2054
2055 Label slow_path_lock;
2056 Label lock_done;
2057
2058 if (method->is_synchronized()) {
2059 assert(!is_critical_native, "unhandled");
2060
2061
2062 const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2063
2064 // Get the handle (the 2nd argument)
2065 __ mov(oop_handle_reg, c_rarg1);
2066
2067 // Get address of the box
2068
2069 __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2070
2071 // Load the oop from the handle
2072 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2073
2074 if (UseBiasedLocking) {
2075 __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, rscratch2, false, lock_done, &slow_path_lock);
2076 }
2077
2078 // Load immediate 1 into swap_reg %rax
2079 __ movl(swap_reg, 1);
2080
2081 // Load (object->mark() | 1) into swap_reg %rax
2082 __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2083
2084 // Save (object->mark() | 1) into BasicLock's displaced header
2085 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2086
2087 // src -> dest iff dest == rax else rax <- dest
2088 __ lock();
2089 __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2090 __ jcc(Assembler::equal, lock_done);
2091
2092 // Hmm should this move to the slow path code area???
2093
2094 // Test if the oopMark is an obvious stack pointer, i.e.,
2095 // 1) (mark & 3) == 0, and
2096 // 2) rsp <= mark < mark + os::pagesize()
2097 // These 3 tests can be done by evaluating the following
2098 // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2099 // assuming both stack pointer and pagesize have their
2100 // least significant 2 bits clear.
2101 // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
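// Illustrative arithmetic with a 4K page: 3 - 4096 == 0xfffffffffffff003, so
// the mask keeps the two low bits plus every bit at or above the page size.
// The result is zero exactly when the mark is 4-byte aligned and lies less
// than one page above rsp, i.e. the existing lock lives on our own stack
// (the recursive case).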
2102
2103 __ subptr(swap_reg, rsp);
2104 __ andptr(swap_reg, 3 - os::vm_page_size());
2105
2106 // Save the test result; for the recursive case, the result is zero
2107 __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2108 __ jcc(Assembler::notEqual, slow_path_lock);
2109
2110 // Slow path will re-enter here
2111
2112 __ bind(lock_done);
2113 }
2114
2115 // Finally just about ready to make the JNI call
2116
2117 // get JNIEnv* which is first argument to native
2118 if (!is_critical_native) {
2119 __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2120
2121 // Now set thread in native
2122 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2123 }
2124
2125 __ call(RuntimeAddress(native_func));
2126
2127 // Verify or restore cpu control state after JNI call
2128 __ restore_cpu_control_state_after_jni();
2129
2130 // Unpack native results.
2131 switch (ret_type) {
2132 case T_BOOLEAN: __ c2bool(rax); break;
2133 case T_CHAR : __ movzwl(rax, rax); break;
2134 case T_BYTE : __ sign_extend_byte (rax); break;
2135 case T_SHORT : __ sign_extend_short(rax); break;
2136 case T_INT : /* nothing to do */ break;
2137 case T_DOUBLE :
2138 case T_FLOAT :
2139 // Result is in xmm0; we'll save as needed
2140 break;
2141 case T_ARRAY: // Really a handle
2142 case T_OBJECT: // Really a handle
2143 break; // can't de-handlize until after safepoint check
2144 case T_VOID: break;
2145 case T_LONG: break;
2146 default : ShouldNotReachHere();
2147 }
2148
2149 Label after_transition;
2150
2151 // If this is a critical native, check for a safepoint or suspend request after the call.
2152 // If a safepoint is needed, transition to native, then to native_trans to handle
2153 // safepoints like the native methods that are not critical natives.
2154 if (is_critical_native) {
2155 Label needs_safepoint;
2156 __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */);
2157 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2158 __ jcc(Assembler::equal, after_transition);
2159 __ bind(needs_safepoint);
2160 }
2161
2162 // Switch thread to "native transition" state before reading the synchronization state.
2163 // This additional state is necessary because reading and testing the synchronization
2164 // state is not atomic w.r.t. GC, as this scenario demonstrates:
2165 // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2166 // VM thread changes sync state to synchronizing and suspends threads for GC.
2167 // Thread A is resumed to finish this native method, but doesn't block here since it
2168 // didn't see any synchronization in progress, and escapes.
2169 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2170
2171 // Force this write out before the read below
2172 __ membar(Assembler::Membar_mask_bits(
2173 Assembler::LoadLoad | Assembler::LoadStore |
2174 Assembler::StoreLoad | Assembler::StoreStore));
2175
2176 // check for safepoint operation in progress and/or pending suspend requests
2177 {
2178 Label Continue;
2179 Label slow_path;
2180
2181 __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2182
2183 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2184 __ jcc(Assembler::equal, Continue);
2185 __ bind(slow_path);
2186
2187 // Don't use call_VM as it will see a possible pending exception and forward it
2188 // and never return here preventing us from clearing _last_native_pc down below.
2189 // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2190 // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2191 // by hand.
2192 //
2193 __ vzeroupper();
2194 save_native_result(masm, ret_type, stack_slots);
2195 __ mov(c_rarg0, r15_thread);
2196 __ mov(r12, rsp); // remember sp
2197 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2198 __ andptr(rsp, -16); // align stack as required by ABI
2199 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2200 __ mov(rsp, r12); // restore sp
2201 __ reinit_heapbase();
2202 // Restore any method result value
2203 restore_native_result(masm, ret_type, stack_slots);
2204 __ bind(Continue);
2205 }
2206
2207 // change thread state
2208 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2209 __ bind(after_transition);
2210
2211 Label reguard;
2212 Label reguard_done;
2213 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2214 __ jcc(Assembler::equal, reguard);
2215 __ bind(reguard_done);
2216
2217 // native result if any is live
2218
2219 // Unlock
2220 Label unlock_done;
2221 Label slow_path_unlock;
2222 if (method->is_synchronized()) {
2223
2224 // Get locked oop from the handle we passed to jni
2225 __ movptr(obj_reg, Address(oop_handle_reg, 0));
2226
2227 Label done;
2228
2229 if (UseBiasedLocking) {
2230 __ biased_locking_exit(obj_reg, old_hdr, done);
2231 }
2232
2233 // Simple recursive lock?
2234
2235 __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2236 __ jcc(Assembler::equal, done);
2237
2238 // Must save rax if it is live now because cmpxchg must use it
2239 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2240 save_native_result(masm, ret_type, stack_slots);
2241 }
2242
2243
2244 // get address of the stack lock
2245 __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2246 // get old displaced header
2247 __ movptr(old_hdr, Address(rax, 0));
2248
2249 // Atomic swap old header if oop still contains the stack lock
2250 __ lock();
2251 __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2252 __ jcc(Assembler::notEqual, slow_path_unlock);
2253
2254 // slow path re-enters here
2255 __ bind(unlock_done);
2256 if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2257 restore_native_result(masm, ret_type, stack_slots);
2258 }
2259
2260 __ bind(done);
2261
2262 }
2263 {
2264 SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2265 save_native_result(masm, ret_type, stack_slots);
2266 __ mov_metadata(c_rarg1, method());
2267 __ call_VM_leaf(
2268 CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2269 r15_thread, c_rarg1);
2270 restore_native_result(masm, ret_type, stack_slots);
2271 }
2272
2273 __ reset_last_Java_frame(false);
2274
2275 // Unbox oop result, e.g. JNIHandles::resolve value.
2276 if (is_reference_type(ret_type)) {
2277 __ resolve_jobject(rax /* value */,
2278 r15_thread /* thread */,
2279 rcx /* tmp */);
2280 }
2281
2282 if (CheckJNICalls) {
2283 // clear_pending_jni_exception_check
2284 __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2285 }
2286
2287 if (!is_critical_native) {
2288 // reset handle block
2289 __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2290 __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2291 }
2292
2293 // pop our frame
2294
2295 __ leave();
2296
2297 if (!is_critical_native) {
2298 // Any exception pending?
2299 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2300 __ jcc(Assembler::notEqual, exception_pending);
2301 }
2302
2303 // Return
2304
2305 __ ret(0);
2306
2307 // Unexpected paths are out of line and go here
2308
2309 if (!is_critical_native) {
2310 // An exception is pending
2311 __ bind(exception_pending);
2312
2313 // forward the exception to the caller
2314 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2315 }
2316
2317 // Slow path locking & unlocking
2318 if (method->is_synchronized()) {
2319
2320 // BEGIN Slow path lock
2321 __ bind(slow_path_lock);
2322
2323 // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2324 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2325
2326 // protect the args we've loaded
2327 save_args(masm, total_c_args, c_arg, out_regs);
2328
2329 __ mov(c_rarg0, obj_reg);
2330 __ mov(c_rarg1, lock_reg);
2331 __ mov(c_rarg2, r15_thread);
2332
2333 // Not a leaf but we have last_Java_frame setup as we want
2334 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2335 restore_args(masm, total_c_args, c_arg, out_regs);
2336
2337 #ifdef ASSERT
2338 { Label L;
2339 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2340 __ jcc(Assembler::equal, L);
2341 __ stop("no pending exception allowed on exit from monitorenter");
2342 __ bind(L);
2343 }
2344 #endif
2345 __ jmp(lock_done);
2346
2347 // END Slow path lock
2348
2349 // BEGIN Slow path unlock
2350 __ bind(slow_path_unlock);
2351
2352 // If we haven't already saved the native result we must save it now as xmm registers
2353 // are still exposed.
2354 __ vzeroupper();
2355 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2356 save_native_result(masm, ret_type, stack_slots);
2357 }
2358
2359 __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2360
2361 __ mov(c_rarg0, obj_reg);
2362 __ mov(c_rarg2, r15_thread);
2363 __ mov(r12, rsp); // remember sp
2364 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2365 __ andptr(rsp, -16); // align stack as required by ABI
2366
2367 // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2368 // NOTE that obj_reg == rbx currently
2369 __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2370 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2371
2372 // args are (oop obj, BasicLock* lock, JavaThread* thread)
2373 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2374 __ mov(rsp, r12); // restore sp
2375 __ reinit_heapbase();
2376 #ifdef ASSERT
2377 {
2378 Label L;
2379 __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2380 __ jcc(Assembler::equal, L);
2381 __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2382 __ bind(L);
2383 }
2384 #endif /* ASSERT */
2385
2386 __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2387
2388 if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2389 restore_native_result(masm, ret_type, stack_slots);
2390 }
2391 __ jmp(unlock_done);
2392
2393 // END Slow path unlock
2394
2395 } // synchronized
2396
2397 // SLOW PATH Reguard the stack if needed
2398
2399 __ bind(reguard);
2400 __ vzeroupper();
2401 save_native_result(masm, ret_type, stack_slots);
2402 __ mov(r12, rsp); // remember sp
2403 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2404 __ andptr(rsp, -16); // align stack as required by ABI
2405 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2406 __ mov(rsp, r12); // restore sp
2407 __ reinit_heapbase();
2408 restore_native_result(masm, ret_type, stack_slots);
2409 // and continue
2410 __ jmp(reguard_done);
2411
2412
2413
2414 __ flush();
2415
2416 nmethod *nm = nmethod::new_native_nmethod(method,
2417 compile_id,
2418 masm->code(),
2419 vep_offset,
2420 frame_complete,
2421 stack_slots / VMRegImpl::slots_per_word,
2422 (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2423 in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2424 oop_maps);
2425
2426 return nm;
2427 }
2428
2429 // This function returns the adjustment (in number of words) to a c2i adapter
2430 // activation, for use during deoptimization.
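// For example (illustrative), a callee with two more locals than parameters
// requires the caller's frame to be extended by 2 * Interpreter::stackElementWords words.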
2431 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2432 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2433 }
2434
2435
2436 uint SharedRuntime::out_preserve_stack_slots() {
2437 return 0;
2438 }
2439
2440
2441 // Number of stack slots between incoming argument block and the start of
2442 // a new frame. The PROLOG must add this many slots to the stack. The
2443 // EPILOG must remove this many slots. amd64 needs two slots for
2444 // return address.
2445 uint SharedRuntime::in_preserve_stack_slots() {
2446 return 4 + 2 * VerifyStackAtCalls;
2447 }
2448
2449 //------------------------------generate_deopt_blob----------------------------
2450 void SharedRuntime::generate_deopt_blob() {
2451 // Allocate space for the code
2452 ResourceMark rm;
2453 // Setup code generation tools
2454 int pad = 0;
2455 if (UseAVX > 2) {
2456 pad += 1024;
2457 }
2458 #if INCLUDE_JVMCI
2459 if (EnableJVMCI) {
2460 pad += 512; // Increase the buffer size when compiling for JVMCI
2461 }
2462 #endif
2463 CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2464 MacroAssembler* masm = new MacroAssembler(&buffer);
2465 int frame_size_in_words;
2466 OopMap* map = NULL;
2467 OopMapSet *oop_maps = new OopMapSet();
2468
2469 // -------------
2470 // This code enters when returning to a de-optimized nmethod. A return
2471 // address has been pushed on the stack, and return values are in
2472 // registers.
2473 // If we are doing a normal deopt then we were called from the patched
2474 // nmethod from the point we returned to the nmethod. So the return
2475 // address on the stack is wrong by NativeCall::instruction_size
2476 // We will adjust the value so it looks like we have the original return
2477 // address on the stack (like when we eagerly deoptimized).
2478 // In the case of an exception pending when deoptimizing, we enter
2479 // with a return address on the stack that points after the call we patched
2480 // into the exception handler. We have the following register state from,
2481 // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2482 // rax: exception oop
2483 // rbx: exception handler
2484 // rdx: throwing pc
2485 // So in this case we simply jam rdx into the useless return address and
2486 // the stack looks just like we want.
2487 //
2488 // At this point we need to de-opt. We save the argument return
2489 // registers. We call the first C routine, fetch_unroll_info(). This
2490 // routine captures the return values and returns a structure which
2491 // describes the current frame size and the sizes of all replacement frames.
2492 // The current frame is compiled code and may contain many inlined
2493 // functions, each with their own JVM state. We pop the current frame, then
2494 // push all the new frames. Then we call the C routine unpack_frames() to
2495 // populate these frames. Finally unpack_frames() returns us the new target
2496 // address. Notice that callee-save registers are BLOWN here; they have
2497 // already been captured in the vframeArray at the time the return PC was
2498 // patched.
2499 address start = __ pc();
2500 Label cont;
2501
2502 // Prolog for non exception case!
2503
2504 // Save everything in sight.
2505 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2506
2507 // Normal deoptimization. Save exec mode for unpack_frames.
2508 __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2509 __ jmp(cont);
2510
2511 int reexecute_offset = __ pc() - start;
2512 #if INCLUDE_JVMCI && !defined(COMPILER1)
2513 if (EnableJVMCI && UseJVMCICompiler) {
2514 // JVMCI does not use this kind of deoptimization
2515 __ should_not_reach_here();
2516 }
2517 #endif
2518
2519 // Reexecute case
2520 // The return address is the pc that describes which bci to re-execute at
2521
2522 // No need to update map as each call to save_live_registers will produce identical oopmap
2523 (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2524
2525 __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2526 __ jmp(cont);
2527
2528 #if INCLUDE_JVMCI
2529 Label after_fetch_unroll_info_call;
2530 int implicit_exception_uncommon_trap_offset = 0;
2531 int uncommon_trap_offset = 0;
2532
2533 if (EnableJVMCI) {
2534 implicit_exception_uncommon_trap_offset = __ pc() - start;
2535
2536 __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2537 __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2538
2539 uncommon_trap_offset = __ pc() - start;
2540
2541 // Save everything in sight.
2542 RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2543 // fetch_unroll_info needs to call last_java_frame()
2544 __ set_last_Java_frame(noreg, noreg, NULL);
2545
2546 __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2547 __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2548
2549 __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2550 __ mov(c_rarg0, r15_thread);
2551 __ movl(c_rarg2, r14); // exec mode
2552 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2553 oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2554
2555 __ reset_last_Java_frame(false);
2556
2557 __ jmp(after_fetch_unroll_info_call);
2558 } // EnableJVMCI
2559 #endif // INCLUDE_JVMCI
2560
2561 int exception_offset = __ pc() - start;
2562
2563 // Prolog for exception case
2564
2565 // all registers are dead at this entry point, except for rax, and
2566 // rdx which contain the exception oop and exception pc
2567 // respectively. Set them in TLS and fall thru to the
2568 // unpack_with_exception_in_tls entry point.
2569
2570 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2571 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2572
2573 int exception_in_tls_offset = __ pc() - start;
2574
2575 // new implementation because exception oop is now passed in JavaThread
2576
2577 // Prolog for exception case
2578 // All registers must be preserved because they might be used by LinearScan
2579 // Exception oop and throwing PC are passed in JavaThread
2580 // tos: stack at point of call to method that threw the exception (i.e. only
2581 // args are on the stack, no return address)
2582
2583 // make room on stack for the return address
2584 // It will be patched later with the throwing pc. The correct value is not
2585 // available now because loading it from memory would destroy registers.
2586 __ push(0);
2587
2588 // Save everything in sight.
2589 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2590
2591 // Now it is safe to overwrite any register
2592
2593 // Deopt during an exception. Save exec mode for unpack_frames.
2594 __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2595
2596 // load throwing pc from JavaThread and patch it as the return address
2597 // of the current frame. Then clear the field in JavaThread
2598
2599 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2600 __ movptr(Address(rbp, wordSize), rdx);
2601 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2602
2603 #ifdef ASSERT
2604 // verify that there is really an exception oop in JavaThread
2605 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2606 __ verify_oop(rax);
2607
2608 // verify that there is no pending exception
2609 Label no_pending_exception;
2610 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2611 __ testptr(rax, rax);
2612 __ jcc(Assembler::zero, no_pending_exception);
2613 __ stop("must not have pending exception here");
2614 __ bind(no_pending_exception);
2615 #endif
2616
2617 __ bind(cont);
2618
2619 // Call C code. Need thread and this frame, but NOT official VM entry
2620 // crud. We cannot block on this call, no GC can happen.
2621 //
2622 // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2623
2624 // fetch_unroll_info needs to call last_java_frame().
2625
2626 __ set_last_Java_frame(noreg, noreg, NULL);
2627 #ifdef ASSERT
2628 { Label L;
2629 __ cmpptr(Address(r15_thread,
2630 JavaThread::last_Java_fp_offset()),
2631 (int32_t)0);
2632 __ jcc(Assembler::equal, L);
2633 __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2634 __ bind(L);
2635 }
2636 #endif // ASSERT
2637 __ mov(c_rarg0, r15_thread);
2638 __ movl(c_rarg1, r14); // exec_mode
2639 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2640
2641 // Need to have an oopmap that tells fetch_unroll_info where to
2642 // find any register it might need.
2643 oop_maps->add_gc_map(__ pc() - start, map);
2644
2645 __ reset_last_Java_frame(false);
2646
2647 #if INCLUDE_JVMCI
2648 if (EnableJVMCI) {
2649 __ bind(after_fetch_unroll_info_call);
2650 }
2651 #endif
2652
2653 // Load UnrollBlock* into rdi
2654 __ mov(rdi, rax);
2655
2656 __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2657 Label noException;
2658 __ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
2659 __ jcc(Assembler::notEqual, noException);
2660 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2661 // QQQ this is useless it was NULL above
2662 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2663 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2664 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2665
2666 __ verify_oop(rax);
2667
2668 // Overwrite the result registers with the exception results.
2669 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2670 // I think this is useless
2671 __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2672
2673 __ bind(noException);
2674
2675 // Only register save data is on the stack.
2676 // Now restore the result registers. Everything else is either dead
2677 // or captured in the vframeArray.
2678 RegisterSaver::restore_result_registers(masm);
2679
2680 // All of the register save area has been popped off the stack. Only the
2681 // return address remains.
2682
2683 // Pop all the frames we must move/replace.
2684 //
2685 // Frame picture (youngest to oldest)
2686 // 1: self-frame (no frame link)
2687 // 2: deopting frame (no frame link)
2688 // 3: caller of deopting frame (could be compiled/interpreted).
2689 //
2690 // Note: by leaving the return address of self-frame on the stack
2691 // and using the size of frame 2 to adjust the stack
2692 // when we are done the return to frame 3 will still be on the stack.
2693
2694 // Pop deoptimized frame
2695 __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2696 __ addptr(rsp, rcx);
2697
2698 // rsp should be pointing at the return address to the caller (3)
2699
2700 // Pick up the initial fp we should save
2701 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2702 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2703
2704 #ifdef ASSERT
2705 // Compilers generate code that bangs the stack by as much as the
2706 // interpreter would need. So this stack banging should never
2707 // trigger a fault. Verify that it does not on non product builds.
2708 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2709 __ bang_stack_size(rbx, rcx);
2710 #endif
2711
2712 // Load address of array of frame pcs into rcx
2713 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2714
2715 // Trash the old pc
2716 __ addptr(rsp, wordSize);
2717
2718 // Load address of array of frame sizes into rsi
2719 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2720
2721 // Load counter into rdx
2722 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2723
2724 // Now adjust the caller's stack to make up for the extra locals
2725 // but record the original sp so that we can save it in the skeletal interpreter
2726 // frame and the stack walking of interpreter_sender will get the unextended sp
2727 // value and not the "real" sp value.
2728
2729 const Register sender_sp = r8;
2730
2731 __ mov(sender_sp, rsp);
2732 __ movl(rbx, Address(rdi,
2733 Deoptimization::UnrollBlock::
2734 caller_adjustment_offset_in_bytes()));
2735 __ subptr(rsp, rbx);
2736
2737 // Push interpreter frames in a loop
2738 Label loop;
2739 __ bind(loop);
2740 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2741 __ subptr(rbx, 2*wordSize); // We'll push pc and ebp by hand
2742 __ pushptr(Address(rcx, 0)); // Save return address
2743 __ enter(); // Save old & set new ebp
2744 __ subptr(rsp, rbx); // Prolog
2745 // This value is corrected by layout_activation_impl
2746 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2747 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2748 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2749 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2750 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2751 __ decrementl(rdx); // Decrement counter
2752 __ jcc(Assembler::notZero, loop);
2753 __ pushptr(Address(rcx, 0)); // Save final return address
2754
2755 // Re-push self-frame
2756 __ enter(); // Save old & set new ebp
2757
2758 // Allocate a full sized register save area.
2759 // Return address and rbp are in place, so we allocate two less words.
2760 __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2761
2762 // Restore frame locals after moving the frame
2763 __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2764 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2765
2766 // Call C code. Need thread but NOT official VM entry
2767 // crud. We cannot block on this call, no GC can happen. Call should
2768 // restore return values to their stack-slots with the new SP.
2769 //
2770 // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2771
2772 // Use rbp because the frames look interpreted now
2773 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2774 // Don't need the precise return PC here, just precise enough to point into this code blob.
2775 address the_pc = __ pc();
2776 __ set_last_Java_frame(noreg, rbp, the_pc);
2777
2778 __ andptr(rsp, -(StackAlignmentInBytes)); // Fix stack alignment as required by ABI
2779 __ mov(c_rarg0, r15_thread);
2780 __ movl(c_rarg1, r14); // second arg: exec_mode
2781 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2782 // Revert SP alignment after call since we're going to do some SP relative addressing below
2783 __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2784
2785 // Set an oopmap for the call site
2786 // Use the same PC we used for the last java frame
2787 oop_maps->add_gc_map(the_pc - start,
2788 new OopMap( frame_size_in_words, 0 ));
2789
2790 // Clear fp AND pc
2791 __ reset_last_Java_frame(true);
2792
2793 // Collect return values
2794 __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2795 __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2796 // I think this is useless (throwing pc?)
2797 __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2798
2799 // Pop self-frame.
2800 __ leave(); // Epilog
2801
2802 // Jump to interpreter
2803 __ ret(0);
2804
2805 // Make sure all code is generated
2806 masm->flush();
2807
2808 _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2809 _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2810 #if INCLUDE_JVMCI
2811 if (EnableJVMCI) {
2812 _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2813 _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2814 }
2815 #endif
2816 }
2817
2818 #ifdef COMPILER2
2819 //------------------------------generate_uncommon_trap_blob--------------------
2820 void SharedRuntime::generate_uncommon_trap_blob() {
2821 // Allocate space for the code
2822 ResourceMark rm;
2823 // Setup code generation tools
2824 CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2825 MacroAssembler* masm = new MacroAssembler(&buffer);
2826
2827 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2828
2829 address start = __ pc();
2830
2831 if (UseRTMLocking) {
2832 // Abort RTM transaction before possible nmethod deoptimization.
2833 __ xabort(0);
2834 }
2835
2836 // Push self-frame. We get here with a return address on the
2837 // stack, so rsp is 8-byte aligned until we allocate our frame.
2838 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2839
2840 // No callee saved registers. rbp is assumed implicitly saved
2841 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2842
2843 // The compiler left unloaded_class_index in j_rarg0; move it to where the
2844 // runtime expects it.
2845 __ movl(c_rarg1, j_rarg0);
2846
2847 __ set_last_Java_frame(noreg, noreg, NULL);
2848
2849 // Call C code. Need thread but NOT official VM entry
2850 // crud. We cannot block on this call, no GC can happen. Call should
2851 // capture callee-saved registers as well as return values.
2852 // Thread is in rdi already.
2853 //
2854 // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2855
2856 __ mov(c_rarg0, r15_thread);
2857 __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2858 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2859
2860 // Set an oopmap for the call site
2861 OopMapSet* oop_maps = new OopMapSet();
2862 OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2863
2864 // location of rbp is known implicitly by the frame sender code
2865
2866 oop_maps->add_gc_map(__ pc() - start, map);
2867
2868 __ reset_last_Java_frame(false);
2869
2870 // Load UnrollBlock* into rdi
2871 __ mov(rdi, rax);
2872
2873 #ifdef ASSERT
2874 { Label L;
2875 __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2876 (int32_t)Deoptimization::Unpack_uncommon_trap);
2877 __ jcc(Assembler::equal, L);
2878 __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap");
2879 __ bind(L);
2880 }
2881 #endif
2882
2883 // Pop all the frames we must move/replace.
2884 //
2885 // Frame picture (youngest to oldest)
2886 // 1: self-frame (no frame link)
2887 // 2: deopting frame (no frame link)
2888 // 3: caller of deopting frame (could be compiled/interpreted).
2889
2890 // Pop self-frame. We have no frame, and must rely only on rax and rsp.
2891 __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2892
2893 // Pop deoptimized frame (int)
2894 __ movl(rcx, Address(rdi,
2895 Deoptimization::UnrollBlock::
2896 size_of_deoptimized_frame_offset_in_bytes()));
2897 __ addptr(rsp, rcx);
2898
2899 // rsp should be pointing at the return address to the caller (3)
2900
2901 // Pick up the initial fp we should save
2902 // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2903 __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2904
2905 #ifdef ASSERT
2906 // Compilers generate code that bangs the stack by as much as the
2907 // interpreter would need. So this stack banging should never
2908 // trigger a fault. Verify that it does not on non product builds.
2909 __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2910 __ bang_stack_size(rbx, rcx);
2911 #endif
2912
2913 // Load address of array of frame pcs into rcx (address*)
2914 __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2915
2916 // Trash the return pc
2917 __ addptr(rsp, wordSize);
2918
2919 // Load address of array of frame sizes into rsi (intptr_t*)
2920 __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset_in_bytes()));
2921
2922 // Counter
2923 __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset_in_bytes())); // (int)
2924
2925 // Now adjust the caller's stack to make up for the extra locals but
2926 // record the original sp so that we can save it in the skeletal
2927 // interpreter frame and the stack walking of interpreter_sender
2928 // will get the unextended sp value and not the "real" sp value.
2929
2930 const Register sender_sp = r8;
2931
2932 __ mov(sender_sp, rsp);
2933 __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset_in_bytes())); // (int)
2934 __ subptr(rsp, rbx);
2935
2936 // Push interpreter frames in a loop
2937 Label loop;
2938 __ bind(loop);
2939 __ movptr(rbx, Address(rsi, 0)); // Load frame size
2940 __ subptr(rbx, 2 * wordSize); // We'll push pc and rbp by hand
2941 __ pushptr(Address(rcx, 0)); // Save return address
2942 __ enter(); // Save old & set new rbp
2943 __ subptr(rsp, rbx); // Prolog
2944 __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2945 sender_sp); // Make it walkable
2946 // This value is corrected by layout_activation_impl
2947 __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2948 __ mov(sender_sp, rsp); // Pass sender_sp to next frame
2949 __ addptr(rsi, wordSize); // Bump array pointer (sizes)
2950 __ addptr(rcx, wordSize); // Bump array pointer (pcs)
2951 __ decrementl(rdx); // Decrement counter
2952 __ jcc(Assembler::notZero, loop);
2953 __ pushptr(Address(rcx, 0)); // Save final return address
2954
2955 // Re-push self-frame
2956 __ enter(); // Save old & set new rbp
2957 __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2958 // Prolog
2959
2960 // Use rbp because the frames look interpreted now
2961 // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2962 // Don't need the precise return PC here, just precise enough to point into this code blob.
2963 address the_pc = __ pc();
2964 __ set_last_Java_frame(noreg, rbp, the_pc);
2965
2966 // Call C code. Need thread but NOT official VM entry
2967 // crud. We cannot block on this call, no GC can happen. Call should
2968 // restore return values to their stack-slots with the new SP.
2969 // Thread is in rdi already.
2970 //
2971 // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2972
2973 __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2974 __ mov(c_rarg0, r15_thread);
2975 __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2976 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2977
2978 // Set an oopmap for the call site
2979 // Use the same PC we used for the last java frame
2980 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2981
2982 // Clear fp AND pc
2983 __ reset_last_Java_frame(true);
2984
2985 // Pop self-frame.
2986 __ leave(); // Epilog
2987
2988 // Jump to interpreter
2989 __ ret(0);
2990
2991 // Make sure all code is generated
2992 masm->flush();
2993
2994 _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
2995 SimpleRuntimeFrame::framesize >> 1);
2996 }
2997 #endif // COMPILER2
2998
2999 //------------------------------generate_handler_blob------
3000 //
// Generate a special Compile2Runtime blob that saves all registers,
// and sets up an oopmap.
3003 //
3004 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3005 assert(StubRoutines::forward_exception_entry() != NULL,
3006 "must be generated before");
3007
3008 ResourceMark rm;
3009 OopMapSet *oop_maps = new OopMapSet();
3010 OopMap* map;
3011
3012 // Allocate space for the code. Setup code generation tools.
3013 CodeBuffer buffer("handler_blob", 2048, 1024);
3014 MacroAssembler* masm = new MacroAssembler(&buffer);
3015
3016 address start = __ pc();
3017 address call_pc = NULL;
3018 int frame_size_in_words;
3019 bool cause_return = (poll_type == POLL_AT_RETURN);
3020 bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3021
3022 if (UseRTMLocking) {
3023 // Abort RTM transaction before calling runtime
3024 // because critical section will be large and will be
3025 // aborted anyway. Also nmethod could be deoptimized.
3026 __ xabort(0);
3027 }
3028
3029 // Make room for return address (or push it again)
3030 if (!cause_return) {
3031 __ push(rbx);
3032 }
3033
3034 // Save registers, fpu state, and flags
3035 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3036
  // The following is basically a call_VM. However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.
3040
3041 __ set_last_Java_frame(noreg, noreg, NULL);
3042
  // The return address must always be correct so that the frame constructor
  // never sees an invalid pc.
3045
3046 if (!cause_return) {
3047 // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee-saved register, so we can look at it later to determine
    // if someone changed the return address for us!
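    // (Address(rbp, wordSize) is this frame's return-address slot; it currently
    // holds the placeholder rbx pushed above and is overwritten here with the
    // pc at the poll.)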
3050 __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3051 __ movptr(Address(rbp, wordSize), rbx);
3052 }
3053
3054 // Do the call
3055 __ mov(c_rarg0, r15_thread);
3056 __ call(RuntimeAddress(call_ptr));
3057
3058 // Set an oopmap for the call site. This oopmap will map all
3059 // oop-registers and debug-info registers as callee-saved. This
3060 // will allow deoptimization at this safepoint to find all possible
3061 // debug-info recordings, as well as let GC find all oops.
3062
3063 oop_maps->add_gc_map( __ pc() - start, map);
3064
3065 Label noException;
3066
3067 __ reset_last_Java_frame(false);
3068
3069 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3070 __ jcc(Assembler::equal, noException);
3071
3072 // Exception pending
3073
3074 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3075
3076 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3077
3078 // No exception case
3079 __ bind(noException);
3080
3081 Label no_adjust;
3082 #ifdef ASSERT
3083 Label bail;
3084 #endif
3085 if (!cause_return) {
3086 Label no_prefix, not_special;
3087
3088 // If our stashed return pc was modified by the runtime we avoid touching it
3089 __ cmpptr(rbx, Address(rbp, wordSize));
3090 __ jccb(Assembler::notEqual, no_adjust);
3091
3092 // Skip over the poll instruction.
3093 // See NativeInstruction::is_safepoint_poll()
3094 // Possible encodings:
3095 // 85 00 test %eax,(%rax)
3096 // 85 01 test %eax,(%rcx)
3097 // 85 02 test %eax,(%rdx)
3098 // 85 03 test %eax,(%rbx)
3099 // 85 06 test %eax,(%rsi)
3100 // 85 07 test %eax,(%rdi)
3101 //
3102 // 41 85 00 test %eax,(%r8)
3103 // 41 85 01 test %eax,(%r9)
3104 // 41 85 02 test %eax,(%r10)
3105 // 41 85 03 test %eax,(%r11)
3106 // 41 85 06 test %eax,(%r14)
3107 // 41 85 07 test %eax,(%r15)
3108 //
3109 // 85 04 24 test %eax,(%rsp)
3110 // 41 85 04 24 test %eax,(%r12)
3111 // 85 45 00 test %eax,0x0(%rbp)
3112 // 41 85 45 00 test %eax,0x0(%r13)
3113
3114 __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3115 __ jcc(Assembler::notEqual, no_prefix);
3116 __ addptr(rbx, 1);
3117 __ bind(no_prefix);
3118 #ifdef ASSERT
3119 __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3120 #endif
3121 // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3122 // r12/rsp 0x04
3123 // r13/rbp 0x05
3124 __ movzbq(rcx, Address(rbx, 1));
3125 __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3126 __ subptr(rcx, 4); // looking for 0x00 .. 0x01
3127 __ cmpptr(rcx, 1);
3128 __ jcc(Assembler::above, not_special);
3129 __ addptr(rbx, 1);
3130 __ bind(not_special);
3131 #ifdef ASSERT
3132 // Verify the correct encoding of the poll we're about to skip.
3133 __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3134 __ jcc(Assembler::notEqual, bail);
3135 // Mask out the modrm bits
3136 __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3137 // rax encodes to 0, so if the bits are nonzero it's incorrect
3138 __ jcc(Assembler::notZero, bail);
3139 #endif
3140 // Adjust return pc forward to step over the safepoint poll instruction
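    // The remaining opcode + ModRM pair is always 2 bytes; any REX prefix or
    // extra SIB/disp8 byte has already been stepped over above.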
3141 __ addptr(rbx, 2);
3142 __ movptr(Address(rbp, wordSize), rbx);
3143 }
3144
3145 __ bind(no_adjust);
3146 // Normal exit, restore registers and exit.
3147 RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3148 __ ret(0);
3149
3150 #ifdef ASSERT
3151 __ bind(bail);
3152 __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3153 #endif
3154
3155 // Make sure all code is generated
3156 masm->flush();
3157
3158 // Fill-out other meta info
3159 return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3160 }
3161
3162 //
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
//
// Generate a stub that calls into the VM to find out the proper destination
// of a Java call. All the argument registers are live at this point,
// but since this is generic code we don't know what they are and the caller
// must do any GC of the args.
3169 //
3170 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3171 assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3172
3173 // allocate space for the code
3174 ResourceMark rm;
3175
3176 CodeBuffer buffer(name, 1000, 512);
3177 MacroAssembler* masm = new MacroAssembler(&buffer);
3178
3179 int frame_size_in_words;
3180
3181 OopMapSet *oop_maps = new OopMapSet();
3182 OopMap* map = NULL;
3183
3184 int start = __ offset();
3185
3186 // No need to save vector registers since they are caller-saved anyway.
3187 map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3188
3189 int frame_complete = __ offset();
3190
3191 __ set_last_Java_frame(noreg, noreg, NULL);
3192
3193 __ mov(c_rarg0, r15_thread);
3194
3195 __ call(RuntimeAddress(destination));
3196
3197
3198 // Set an oopmap for the call site.
3199 // We need this not only for callee-saved registers, but also for volatile
3200 // registers that the compiler might be keeping live across a safepoint.
3201
3202 oop_maps->add_gc_map( __ offset() - start, map);
3203
3204 // rax contains the address we are going to jump to assuming no exception got installed
3205
3206 // clear last_Java_sp
3207 __ reset_last_Java_frame(false);
3208 // check for pending exceptions
3209 Label pending;
3210 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3211 __ jcc(Assembler::notEqual, pending);
3212
3213 // get the returned Method*
3214 __ get_vm_result_2(rbx, r15_thread);
3215 __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3216
3217 __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
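  // The rbx/rax save slots now hold the resolved Method* and the new call
  // destination, so restore_live_registers below reloads them into rbx/rax.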
3218
3219 RegisterSaver::restore_live_registers(masm);
3220
  // We are back to the original state on entry and ready to go.
3222
3223 __ jmp(rax);
3224
3225 // Pending exception after the safepoint
3226
3227 __ bind(pending);
3228
3229 RegisterSaver::restore_live_registers(masm);
3230
3231 // exception pending => remove activation and forward to exception handler
3232
3233 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3234
3235 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3236 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3237
3238 // -------------
3239 // make sure all code is generated
3240 masm->flush();
3241
3242 // return the blob
  // frame size is in words here
3244 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3245 }
3246
3247 #ifdef COMPILER2
3248 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3249
3250 class NativeInvokerGenerator : public StubCodeGenerator {
3251 address _call_target;
3252 int _shadow_space_bytes;
3253
3254 const GrowableArray<VMReg>& _input_registers;
3255 const GrowableArray<VMReg>& _output_registers;
3256
3257 int _frame_complete;
3258 int _framesize;
3259 OopMapSet* _oop_maps;
3260 public:
3261 NativeInvokerGenerator(CodeBuffer* buffer,
3262 address call_target,
3263 int shadow_space_bytes,
3264 const GrowableArray<VMReg>& input_registers,
3265 const GrowableArray<VMReg>& output_registers)
3266 : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3267 _call_target(call_target),
3268 _shadow_space_bytes(shadow_space_bytes),
3269 _input_registers(input_registers),
3270 _output_registers(output_registers),
3271 _frame_complete(0),
3272 _framesize(0),
3273 _oop_maps(NULL) {
3274 assert(_output_registers.length() <= 1
3275 || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3276
3277 }
3278
3279 void generate();
3280
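  // Number of bytes needed to spill the (single) return-value register around
  // the slow-path runtime calls: 8 for a general-purpose register, or the
  // active vector width (16/32/64 bytes depending on UseAVX) for an XMM return.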
3281 int spill_size_in_bytes() const {
3282 if (_output_registers.length() == 0) {
3283 return 0;
3284 }
3285 VMReg reg = _output_registers.at(0);
3286 assert(reg->is_reg(), "must be a register");
3287 if (reg->is_Register()) {
3288 return 8;
3289 } else if (reg->is_XMMRegister()) {
3290 if (UseAVX >= 3) {
3291 return 64;
3292 } else if (UseAVX >= 1) {
3293 return 32;
3294 } else {
3295 return 16;
3296 }
3297 } else {
3298 ShouldNotReachHere();
3299 }
3300 return 0;
3301 }
3302
3303 void spill_out_registers() {
3304 if (_output_registers.length() == 0) {
3305 return;
3306 }
3307 VMReg reg = _output_registers.at(0);
3308 assert(reg->is_reg(), "must be a register");
3309 MacroAssembler* masm = _masm;
3310 if (reg->is_Register()) {
3311 __ movptr(Address(rsp, 0), reg->as_Register());
3312 } else if (reg->is_XMMRegister()) {
3313 if (UseAVX >= 3) {
3314 __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3315 } else if (UseAVX >= 1) {
3316 __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3317 } else {
3318 __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3319 }
3320 } else {
3321 ShouldNotReachHere();
3322 }
3323 }
3324
3325 void fill_out_registers() {
3326 if (_output_registers.length() == 0) {
3327 return;
3328 }
3329 VMReg reg = _output_registers.at(0);
3330 assert(reg->is_reg(), "must be a register");
3331 MacroAssembler* masm = _masm;
3332 if (reg->is_Register()) {
3333 __ movptr(reg->as_Register(), Address(rsp, 0));
3334 } else if (reg->is_XMMRegister()) {
3335 if (UseAVX >= 3) {
3336 __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3337 } else if (UseAVX >= 1) {
3338 __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3339 } else {
3340 __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3341 }
3342 } else {
3343 ShouldNotReachHere();
3344 }
3345 }
3346
3347 int frame_complete() const {
3348 return _frame_complete;
3349 }
3350
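  // _framesize is tracked in 32-bit stack slots; convert to words for the
  // RuntimeStub bookkeeping.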
3351 int framesize() const {
3352 return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3353 }
3354
3355 OopMapSet* oop_maps() const {
3356 return _oop_maps;
3357 }
3358
3359 private:
3360 #ifdef ASSERT
3361 bool target_uses_register(VMReg reg) {
3362 return _input_registers.contains(reg) || _output_registers.contains(reg);
3363 }
3364 #endif
3365 };
3366
3367 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3368 int shadow_space_bytes,
3369 const GrowableArray<VMReg>& input_registers,
3370 const GrowableArray<VMReg>& output_registers) {
3371 int locs_size = 64;
3372 CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3373 NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3374 g.generate();
3375 code.log_section_sizes("nep_invoker_blob");
3376
3377 RuntimeStub* stub =
3378 RuntimeStub::new_runtime_stub("nep_invoker_blob",
3379 &code,
3380 g.frame_complete(),
3381 g.framesize(),
3382 g.oop_maps(), false);
3383 return stub;
3384 }
3385
3386 void NativeInvokerGenerator::generate() {
3387 assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3388
3389 enum layout {
3390 rbp_off,
3391 rbp_off2,
3392 return_off,
3393 return_off2,
3394 framesize // inclusive of return address
3395 };
3396
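  // Frame size is computed in 32-bit slots: the fixed rbp/return-address slots
  // plus shadow space and the return-value spill area, rounded up to a multiple
  // of 4 slots (16 bytes) so that rsp stays 16-byte aligned.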
3397 _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3398 assert(is_even(_framesize/2), "sp not 16-byte aligned");
3399
3400 _oop_maps = new OopMapSet();
3401 MacroAssembler* masm = _masm;
3402
3403 address start = __ pc();
3404
3405 __ enter();
3406
3407 // return address and rbp are already in place
3408 __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3409
3410 _frame_complete = __ pc() - start;
3411
3412 address the_pc = __ pc();
3413
3414 __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3415 OopMap* map = new OopMap(_framesize, 0);
3416 _oop_maps->add_gc_map(the_pc - start, map);
3417
3418 // State transition
3419 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3420
3421 __ call(RuntimeAddress(_call_target));
3422
3423 __ restore_cpu_control_state_after_jni();
3424
3425 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3426
3427 // Force this write out before the read below
3428 __ membar(Assembler::Membar_mask_bits(
3429 Assembler::LoadLoad | Assembler::LoadStore |
3430 Assembler::StoreLoad | Assembler::StoreStore));
3431
3432 Label L_after_safepoint_poll;
3433 Label L_safepoint_poll_slow_path;
3434
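  // Take the slow path if a safepoint/handshake poll is pending or the thread
  // has a suspend request.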
3435 __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3436 __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3437 __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3438
3439 __ bind(L_after_safepoint_poll);
3440
3441 // change thread state
3442 __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3443
3444 __ block_comment("reguard stack check");
3445 Label L_reguard;
3446 Label L_after_reguard;
3447 __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3448 __ jcc(Assembler::equal, L_reguard);
3449 __ bind(L_after_reguard);
3450
3451 __ reset_last_Java_frame(r15_thread, true);
3452
3453 __ leave(); // required for proper stackwalking of RuntimeStub frame
3454 __ ret(0);
3455
3456 //////////////////////////////////////////////////////////////////////////////
3457
3458 __ block_comment("{ L_safepoint_poll_slow_path");
3459 __ bind(L_safepoint_poll_slow_path);
3460 __ vzeroupper();
3461
3462 spill_out_registers();
3463
3464 __ mov(c_rarg0, r15_thread);
3465 __ mov(r12, rsp); // remember sp
3466 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3467 __ andptr(rsp, -16); // align stack as required by ABI
3468 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3469 __ mov(rsp, r12); // restore sp
3470 __ reinit_heapbase();
3471
3472 fill_out_registers();
3473
3474 __ jmp(L_after_safepoint_poll);
3475 __ block_comment("} L_safepoint_poll_slow_path");
3476
3477 //////////////////////////////////////////////////////////////////////////////
3478
3479 __ block_comment("{ L_reguard");
3480 __ bind(L_reguard);
3481 __ vzeroupper();
3482
3483 spill_out_registers();
3484
3485 __ mov(r12, rsp); // remember sp
3486 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3487 __ andptr(rsp, -16); // align stack as required by ABI
3488 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3489 __ mov(rsp, r12); // restore sp
3490 __ reinit_heapbase();
3491
3492 fill_out_registers();
3493
3494 __ jmp(L_after_reguard);
3495
3496 __ block_comment("} L_reguard");
3497
3498 //////////////////////////////////////////////////////////////////////////////
3499
3500 __ flush();
3501 }
3502 #endif // COMPILER2
3503
3504 //------------------------------Montgomery multiplication------------------------
3505 //
3506
3507 #ifndef _WINDOWS
3508
3509 // Subtract 0:b from carry:a. Return carry.
3510 static julong
3511 sub(julong a[], julong b[], julong carry, long len) {
3512 long long i = 0, cnt = len;
3513 julong tmp;
3514 asm volatile("clc; "
3515 "0: ; "
3516 "mov (%[b], %[i], 8), %[tmp]; "
3517 "sbb %[tmp], (%[a], %[i], 8); "
3518 "inc %[i]; dec %[cnt]; "
3519 "jne 0b; "
3520 "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3521 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3522 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3523 : "memory");
3524 return tmp;
3525 }
3526
3527 // Multiply (unsigned) Long A by Long B, accumulating the double-
3528 // length result into the accumulator formed of T0, T1, and T2.
3529 #define MACC(A, B, T0, T1, T2) \
3530 do { \
3531 unsigned long hi, lo; \
3532 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3533 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3534 : "r"(A), "a"(B) : "cc"); \
3535 } while(0)
3536
3537 // As above, but add twice the double-length result into the
3538 // accumulator.
3539 #define MACC2(A, B, T0, T1, T2) \
3540 do { \
3541 unsigned long hi, lo; \
3542 __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3543 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
3544 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
3545 : "r"(A), "a"(B) : "cc"); \
3546 } while(0)
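
// In other words: MACC adds the 128-bit product A*B into the 192-bit
// accumulator T2:T1:T0, and MACC2 adds that product twice (used for the
// squaring case, where each cross term a[i]*a[j] appears twice).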
3547
3548 #else //_WINDOWS
3549
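// Subtract 0:b from carry:a. Return carry. (Intrinsics version: a - b is
// computed as a + ~b + 1 by seeding _addcarry_u64 with an initial carry of 1.)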
3550 static julong
3551 sub(julong a[], julong b[], julong carry, long len) {
3552 long i;
3553 julong tmp;
3554 unsigned char c = 1;
3555 for (i = 0; i < len; i++) {
3556 c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3557 a[i] = tmp;
3558 }
3559 c = _addcarry_u64(c, carry, ~0, &tmp);
3560 return tmp;
3561 }
3562
3563 // Multiply (unsigned) Long A by Long B, accumulating the double-
3564 // length result into the accumulator formed of T0, T1, and T2.
3565 #define MACC(A, B, T0, T1, T2) \
3566 do { \
3567 julong hi, lo; \
3568 lo = _umul128(A, B, &hi); \
3569 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3570 c = _addcarry_u64(c, hi, T1, &T1); \
3571 _addcarry_u64(c, T2, 0, &T2); \
3572 } while(0)
3573
3574 // As above, but add twice the double-length result into the
3575 // accumulator.
3576 #define MACC2(A, B, T0, T1, T2) \
3577 do { \
3578 julong hi, lo; \
3579 lo = _umul128(A, B, &hi); \
3580 unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3581 c = _addcarry_u64(c, hi, T1, &T1); \
3582 _addcarry_u64(c, T2, 0, &T2); \
3583 c = _addcarry_u64(0, lo, T0, &T0); \
3584 c = _addcarry_u64(c, hi, T1, &T1); \
3585 _addcarry_u64(c, T2, 0, &T2); \
3586 } while(0)
3587
3588 #endif //_WINDOWS
3589
3590 // Fast Montgomery multiplication. The derivation of the algorithm is
3591 // in A Cryptographic Library for the Motorola DSP56000,
3592 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3593
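// A sketch of the invariant maintained below: with R = 2^(64*len) and
// inv == -n[0]^-1 mod 2^64 (checked by the assert), the routine accumulates
// a*b + m*n one 64-bit column at a time, choosing each m[i] so that the low
// word of the column becomes zero; the surviving high words form the
// Montgomery product a*b*R^-1 (mod n).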
3594 static void NOINLINE
3595 montgomery_multiply(julong a[], julong b[], julong n[],
3596 julong m[], julong inv, int len) {
3597 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3598 int i;
3599
3600 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3601
3602 for (i = 0; i < len; i++) {
3603 int j;
3604 for (j = 0; j < i; j++) {
3605 MACC(a[j], b[i-j], t0, t1, t2);
3606 MACC(m[j], n[i-j], t0, t1, t2);
3607 }
3608 MACC(a[i], b[0], t0, t1, t2);
3609 m[i] = t0 * inv;
3610 MACC(m[i], n[0], t0, t1, t2);
3611
3612 assert(t0 == 0, "broken Montgomery multiply");
3613
3614 t0 = t1; t1 = t2; t2 = 0;
3615 }
3616
3617 for (i = len; i < 2*len; i++) {
3618 int j;
3619 for (j = i-len+1; j < len; j++) {
3620 MACC(a[j], b[i-j], t0, t1, t2);
3621 MACC(m[j], n[i-j], t0, t1, t2);
3622 }
3623 m[i-len] = t0;
3624 t0 = t1; t1 = t2; t2 = 0;
3625 }
3626
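  // If the accumulation carried out of the top word (t0 != 0), subtract n
  // until the carry clears.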
3627 while (t0)
3628 t0 = sub(m, n, t0, len);
3629 }
3630
3631 // Fast Montgomery squaring. This uses asymptotically 25% fewer
3632 // multiplies so it should be up to 25% faster than Montgomery
3633 // multiplication. However, its loop control is more complex and it
3634 // may actually run slower on some machines.
3635
3636 static void NOINLINE
3637 montgomery_square(julong a[], julong n[],
3638 julong m[], julong inv, int len) {
3639 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3640 int i;
3641
3642 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3643
3644 for (i = 0; i < len; i++) {
3645 int j;
3646 int end = (i+1)/2;
3647 for (j = 0; j < end; j++) {
3648 MACC2(a[j], a[i-j], t0, t1, t2);
3649 MACC(m[j], n[i-j], t0, t1, t2);
3650 }
3651 if ((i & 1) == 0) {
3652 MACC(a[j], a[j], t0, t1, t2);
3653 }
3654 for (; j < i; j++) {
3655 MACC(m[j], n[i-j], t0, t1, t2);
3656 }
3657 m[i] = t0 * inv;
3658 MACC(m[i], n[0], t0, t1, t2);
3659
3660 assert(t0 == 0, "broken Montgomery square");
3661
3662 t0 = t1; t1 = t2; t2 = 0;
3663 }
3664
3665 for (i = len; i < 2*len; i++) {
3666 int start = i-len+1;
3667 int end = start + (len - start)/2;
3668 int j;
3669 for (j = start; j < end; j++) {
3670 MACC2(a[j], a[i-j], t0, t1, t2);
3671 MACC(m[j], n[i-j], t0, t1, t2);
3672 }
3673 if ((i & 1) == 0) {
3674 MACC(a[j], a[j], t0, t1, t2);
3675 }
3676 for (; j < len; j++) {
3677 MACC(m[j], n[i-j], t0, t1, t2);
3678 }
3679 m[i-len] = t0;
3680 t0 = t1; t1 = t2; t2 = 0;
3681 }
3682
3683 while (t0)
3684 t0 = sub(m, n, t0, len);
3685 }
3686
3687 // Swap words in a longword.
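// e.g. swap(0x0000000100000002ULL) == 0x0000000200000001ULL.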
3688 static julong swap(julong x) {
3689 return (x << 32) | (x >> 32);
3690 }
3691
3692 // Copy len longwords from s to d, word-swapping as we go. The
3693 // destination array is reversed.
3694 static void reverse_words(julong *s, julong *d, int len) {
3695 d += len;
3696 while(len-- > 0) {
3697 d--;
3698 *d = swap(*s);
3699 s++;
3700 }
3701 }
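
// Together, the per-word swap and the array reversal convert the caller's
// most-significant-word-first jint arrays into the least-significant-first
// julong layout that the Montgomery routines above operate on (and convert
// the result back on the way out).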
3702
3703 // The threshold at which squaring is advantageous was determined
3704 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3705 #define MONTGOMERY_SQUARING_THRESHOLD 64
3706
3707 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3708 jint len, jlong inv,
3709 jint *m_ints) {
3710 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3711 int longwords = len/2;
3712
  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use a total of 8K bytes of stack space here.
3716 int divisor = sizeof(julong) * 4;
3717 guarantee(longwords <= 8192 / divisor, "must be");
3718 int total_allocation = longwords * sizeof (julong) * 4;
3719 julong *scratch = (julong *)alloca(total_allocation);
3720
3721 // Local scratch arrays
3722 julong
3723 *a = scratch + 0 * longwords,
3724 *b = scratch + 1 * longwords,
3725 *n = scratch + 2 * longwords,
3726 *m = scratch + 3 * longwords;
3727
3728 reverse_words((julong *)a_ints, a, longwords);
3729 reverse_words((julong *)b_ints, b, longwords);
3730 reverse_words((julong *)n_ints, n, longwords);
3731
3732 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3733
3734 reverse_words(m, (julong *)m_ints, longwords);
3735 }
3736
3737 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3738 jint len, jlong inv,
3739 jint *m_ints) {
3740 assert(len % 2 == 0, "array length in montgomery_square must be even");
3741 int longwords = len/2;
3742
  // Make very sure we don't use so much space that the stack might
  // overflow. 512 jints corresponds to a 16384-bit integer and
  // will use a total of 6K bytes of stack space here.
3746 int divisor = sizeof(julong) * 3;
3747 guarantee(longwords <= (8192 / divisor), "must be");
3748 int total_allocation = longwords * sizeof (julong) * 3;
3749 julong *scratch = (julong *)alloca(total_allocation);
3750
3751 // Local scratch arrays
3752 julong
3753 *a = scratch + 0 * longwords,
3754 *n = scratch + 1 * longwords,
3755 *m = scratch + 2 * longwords;
3756
3757 reverse_words((julong *)a_ints, a, longwords);
3758 reverse_words((julong *)n_ints, n, longwords);
3759
3760 if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3761 ::montgomery_square(a, n, m, (julong)inv, longwords);
3762 } else {
3763 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3764 }
3765
3766 reverse_words(m, (julong *)m_ints, longwords);
3767 }
3768
3769 #ifdef COMPILER2
3770 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3771 //
3772 //------------------------------generate_exception_blob---------------------------
// Creates the exception blob at the end.
// This code is jumped to from a compiled method when an exception is thrown
// (see emit_exception_handler in the x86_64.ad file).
3776 //
// Given an exception pc at a call, we call into the runtime for the
// handler in this method. This handler might merely restore state
// (i.e., callee-saved registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java-level handler
// for the nmethod.
3782 //
3783 // This code is entered with a jmp.
3784 //
3785 // Arguments:
3786 // rax: exception oop
3787 // rdx: exception pc
3788 //
3789 // Results:
3790 // rax: exception oop
3791 // rdx: exception pc in caller or ???
3792 // destination: exception handler of caller
3793 //
3794 // Note: the exception pc MUST be at a call (precise debug information)
3795 // Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3796 //
3797
3798 void OptoRuntime::generate_exception_blob() {
3799 assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3800 assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3801 assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3802
3803 assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3804
3805 // Allocate space for the code
3806 ResourceMark rm;
3807 // Setup code generation tools
3808 CodeBuffer buffer("exception_blob", 2048, 1024);
3809 MacroAssembler* masm = new MacroAssembler(&buffer);
3810
3811
3812 address start = __ pc();
3813
3814 // Exception pc is 'return address' for stack walker
3815 __ push(rdx);
3816 __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3817
3818 // Save callee-saved registers. See x86_64.ad.
3819
  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-saved registers now that adapter frames are gone.
3823
3824 __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3825
  // Store exception in Thread object. We cannot pass any arguments to the
  // handle_exception call, since we do not want to make any assumption
  // about the size of the frame in which the exception happened.
3829 // c_rarg0 is either rdi (Linux) or rcx (Windows).
3830 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3831 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3832
3833 // This call does all the hard work. It checks if an exception handler
3834 // exists in the method.
3835 // If so, it returns the handler address.
3836 // If not, it prepares for stack-unwinding, restoring the callee-save
3837 // registers of the frame being removed.
3838 //
3839 // address OptoRuntime::handle_exception_C(JavaThread* thread)
3840
3841 // At a method handle call, the stack may not be properly aligned
3842 // when returning with an exception.
3843 address the_pc = __ pc();
3844 __ set_last_Java_frame(noreg, noreg, the_pc);
3845 __ mov(c_rarg0, r15_thread);
3846 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
3847 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3848
3849 // Set an oopmap for the call site. This oopmap will only be used if we
3850 // are unwinding the stack. Hence, all locations will be dead.
3851 // Callee-saved registers will be the same as the frame above (i.e.,
3852 // handle_exception_stub), since they were restored when we got the
3853 // exception.
3854
3855 OopMapSet* oop_maps = new OopMapSet();
3856
3857 oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3858
3859 __ reset_last_Java_frame(false);
3860
3861 // Restore callee-saved registers
3862
  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-saved registers now that adapter frames are gone.
3866
3867 __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3868
3869 __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3870 __ pop(rdx); // No need for exception pc anymore
3871
3872 // rax: exception handler
3873
3874 // We have a handler in rax (could be deopt blob).
3875 __ mov(r8, rax);
3876
3877 // Get the exception oop
3878 __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3879 // Get the exception pc in case we are deoptimized
3880 __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3881 #ifdef ASSERT
3882 __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3883 __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3884 #endif
3885 // Clear the exception oop so GC no longer processes it as a root.
3886 __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3887
3888 // rax: exception oop
3889 // r8: exception handler
3890 // rdx: exception pc
3891 // Jump to handler
3892
3893 __ jmp(r8);
3894
3895 // Make sure all code is generated
3896 masm->flush();
3897
3898 // Set exception blob
3899 _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3900 }
3901 #endif // COMPILER2
3902
3903 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
3904 int total_in_args, const VMRegPair* in_regs,
3905 int total_out_args, VMRegPair* out_regs,
3906 GrowableArray<int>& arg_order,
3907 VMRegPair tmp_vmreg) {
3908 ComputeMoveOrder order(total_in_args, in_regs,
3909 total_out_args, out_regs,
3910 in_sig_bt, arg_order, tmp_vmreg);
3911 }