1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/debugInfoRec.hpp"
  32 #include "code/icBuffer.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/disassembler.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "logging/logStream.hpp"
  44 #include "memory/resourceArea.hpp"
  45 #include "memory/universe.hpp"
  46 #include "oops/compiledICHolder.hpp"
  47 #include "oops/klass.inline.hpp"
  48 #include "prims/methodHandles.hpp"
  49 #include "runtime/jniHandles.hpp"
  50 #include "runtime/safepointMechanism.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/signature.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "runtime/vframeArray.hpp"
  55 #include "runtime/vm_version.hpp"
  56 #include "utilities/align.hpp"
  57 #include "utilities/formatBuffer.hpp"
  58 #include "vmreg_x86.inline.hpp"
  59 #ifdef COMPILER1
  60 #include "c1/c1_Runtime1.hpp"
  61 #endif
  62 #ifdef COMPILER2
  63 #include "opto/runtime.hpp"
  64 #endif
  65 #if INCLUDE_JVMCI
  66 #include "jvmci/jvmciJavaClasses.hpp"
  67 #endif
  68 
  69 #define __ masm->
  70 
  71 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  72 
  73 class SimpleRuntimeFrame {
  74 
  75   public:
  76 
  77   // Most of the runtime stubs have this simple frame layout.
  78   // This class exists to make the layout shared in one place.
  79   // Offsets are for compiler stack slots, which are jints.
  80   enum layout {
  81     // The frame sender code expects that rbp will be in the "natural" place and
  82     // will override any oopMap setting for it. We must therefore force the layout
  83     // so that it agrees with the frame sender code.
  84     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  85     rbp_off2,
  86     return_off, return_off2,
  87     framesize
  88   };
  89 };
  90 
  91 class RegisterSaver {
  92   // Capture info about frame layout.  Layout offsets are in jint
  93   // units because compiler frame slots are jints.
  94 #define XSAVE_AREA_BEGIN 160
  95 #define XSAVE_AREA_YMM_BEGIN 576
  96 #define XSAVE_AREA_OPMASK_BEGIN 1088
  97 #define XSAVE_AREA_ZMM_BEGIN 1152
  98 #define XSAVE_AREA_UPPERBANK 1664
  99 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 100 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 101 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 102 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 103 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 104   enum layout {
 105     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 106     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 107     DEF_XMM_OFFS(0),
 108     DEF_XMM_OFFS(1),
 109     // 2..15 are implied in range usage
 110     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 111     DEF_YMM_OFFS(0),
 112     DEF_YMM_OFFS(1),
 113     // 2..15 are implied in range usage
 114     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 115     DEF_OPMASK_OFFS(0),
 116     DEF_OPMASK_OFFS(1),
 117     // 2..7 are implied in range usage
 118     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 119     DEF_ZMM_OFFS(0),
 120     DEF_ZMM_OFFS(1),
 121     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 122     DEF_ZMM_UPPER_OFFS(16),
 123     DEF_ZMM_UPPER_OFFS(17),
 124     // 18..31 are implied in range usage
 125     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 126     fpu_stateH_end,
 127     r15_off, r15H_off,
 128     r14_off, r14H_off,
 129     r13_off, r13H_off,
 130     r12_off, r12H_off,
 131     r11_off, r11H_off,
 132     r10_off, r10H_off,
 133     r9_off,  r9H_off,
 134     r8_off,  r8H_off,
 135     rdi_off, rdiH_off,
 136     rsi_off, rsiH_off,
 137     ignore_off, ignoreH_off,  // extra copy of rbp
 138     rsp_off, rspH_off,
 139     rbx_off, rbxH_off,
 140     rdx_off, rdxH_off,
 141     rcx_off, rcxH_off,
 142     rax_off, raxH_off,
 143     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 144     align_off, alignH_off,
 145     flags_off, flagsH_off,
 146     // The frame sender code expects that rbp will be in the "natural" place and
 147     // will override any oopMap setting for it. We must therefore force the layout
 148     // so that it agrees with the frame sender code.
 149     rbp_off, rbpH_off,        // copy of rbp we will restore
 150     return_off, returnH_off,  // slot for return address
 151     reg_save_size             // size in compiler stack slots
 152   };
 153 
 154  public:
 155   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors);
 156   static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 157 
 158   // Offsets into the register save area
 159   // Used by deoptimization when it is managing result register
 160   // values on its own
 161 
 162   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 163   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 164   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 165   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 166   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 167 
 168   // During deoptimization only the result registers need to be restored,
 169   // all the other values have already been extracted.
 170   static void restore_result_registers(MacroAssembler* masm);
 171 };
 172 
 173 // Register is a class, but it would be assigned numerical value.
 174 // "0" is assigned for rax. Thus we need to ignore -Wnonnull.
 175 PRAGMA_DIAG_PUSH
 176 PRAGMA_NONNULL_IGNORED
 177 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
 178   int off = 0;
 179   int num_xmm_regs = XMMRegisterImpl::available_xmm_registers();
 180 #if COMPILER2_OR_JVMCI
 181   if (save_vectors && UseAVX == 0) {
 182     save_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 183   }
 184   assert(!save_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 185 #else
 186   save_vectors = false; // vectors are generated only by C2 and JVMCI
 187 #endif
 188 
 189   // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
 190   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 191   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 192   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 193   // CodeBlob frame size is in words.
 194   int frame_size_in_words = frame_size_in_bytes / wordSize;
 195   *total_frame_words = frame_size_in_words;
 196 
 197   // Save registers, fpu state, and flags.
 198   // We assume caller has already pushed the return address onto the
 199   // stack, so rsp is 8-byte aligned here.
 200   // We push rpb twice in this sequence because we want the real rbp
 201   // to be under the return like a normal enter.
 202 
 203   __ enter();          // rsp becomes 16-byte aligned here
 204   __ push_CPU_state(); // Push a multiple of 16 bytes
 205 
 206   // push cpu state handles this on EVEX enabled targets
 207   if (save_vectors) {
 208     // Save upper half of YMM registers(0..15)
 209     int base_addr = XSAVE_AREA_YMM_BEGIN;
 210     for (int n = 0; n < 16; n++) {
 211       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 212     }
 213     if (VM_Version::supports_evex()) {
 214       // Save upper half of ZMM registers(0..15)
 215       base_addr = XSAVE_AREA_ZMM_BEGIN;
 216       for (int n = 0; n < 16; n++) {
 217         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 218       }
 219       // Save full ZMM registers(16..num_xmm_regs)
 220       base_addr = XSAVE_AREA_UPPERBANK;
 221       off = 0;
 222       int vector_len = Assembler::AVX_512bit;
 223       for (int n = 16; n < num_xmm_regs; n++) {
 224         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 225       }
 226 #if COMPILER2_OR_JVMCI
 227       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 228       off = 0;
 229       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 230         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 231       }
 232 #endif
 233     }
 234   } else {
 235     if (VM_Version::supports_evex()) {
 236       // Save upper bank of ZMM registers(16..31) for double/float usage
 237       int base_addr = XSAVE_AREA_UPPERBANK;
 238       off = 0;
 239       for (int n = 16; n < num_xmm_regs; n++) {
 240         __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
 241       }
 242 #if COMPILER2_OR_JVMCI
 243       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 244       off = 0;
 245       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 246         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 247       }
 248 #endif
 249     }
 250   }
 251   __ vzeroupper();
 252   if (frame::arg_reg_save_area_bytes != 0) {
 253     // Allocate argument register save area
 254     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 255   }
 256 
 257   // Set an oopmap for the call site.  This oopmap will map all
 258   // oop-registers and debug-info registers as callee-saved.  This
 259   // will allow deoptimization at this safepoint to find all possible
 260   // debug-info recordings, as well as let GC find all oops.
 261 
 262   OopMapSet *oop_maps = new OopMapSet();
 263   OopMap* map = new OopMap(frame_size_in_slots, 0);
 264 
 265 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 266 
 267   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 271   // rbp location is known implicitly by the frame sender code, needs no oopmap
 272   // and the location where rbp was saved by is ignored
 273   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 283   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 284   // on EVEX enabled targets, we get it included in the xsave area
 285   off = xmm0_off;
 286   int delta = xmm1_off - off;
 287   for (int n = 0; n < 16; n++) {
 288     XMMRegister xmm_name = as_XMMRegister(n);
 289     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 290     off += delta;
 291   }
 292   if (UseAVX > 2) {
 293     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 294     off = zmm16_off;
 295     delta = zmm17_off - off;
 296     for (int n = 16; n < num_xmm_regs; n++) {
 297       XMMRegister zmm_name = as_XMMRegister(n);
 298       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 299       off += delta;
 300     }
 301   }
 302 
 303 #if COMPILER2_OR_JVMCI
 304   if (save_vectors) {
 305     // Save upper half of YMM registers(0..15)
 306     off = ymm0_off;
 307     delta = ymm1_off - ymm0_off;
 308     for (int n = 0; n < 16; n++) {
 309       XMMRegister ymm_name = as_XMMRegister(n);
 310       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 311       off += delta;
 312     }
 313     if (VM_Version::supports_evex()) {
 314       // Save upper half of ZMM registers(0..15)
 315       off = zmm0_off;
 316       delta = zmm1_off - zmm0_off;
 317       for (int n = 0; n < 16; n++) {
 318         XMMRegister zmm_name = as_XMMRegister(n);
 319         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 320         off += delta;
 321       }
 322     }
 323   }
 324 #endif // COMPILER2_OR_JVMCI
 325 
 326   // %%% These should all be a waste but we'll keep things as they were for now
 327   if (true) {
 328     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 332     // rbp location is known implicitly by the frame sender code, needs no oopmap
 333     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 343     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 344     // on EVEX enabled targets, we get it included in the xsave area
 345     off = xmm0H_off;
 346     delta = xmm1H_off - off;
 347     for (int n = 0; n < 16; n++) {
 348       XMMRegister xmm_name = as_XMMRegister(n);
 349       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 350       off += delta;
 351     }
 352     if (UseAVX > 2) {
 353       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 354       off = zmm16H_off;
 355       delta = zmm17H_off - off;
 356       for (int n = 16; n < num_xmm_regs; n++) {
 357         XMMRegister zmm_name = as_XMMRegister(n);
 358         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 359         off += delta;
 360       }
 361     }
 362   }
 363 
 364   return map;
 365 }
 366 PRAGMA_DIAG_POP
 367 
 368 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
 369   int num_xmm_regs = XMMRegisterImpl::available_xmm_registers();
 370   if (frame::arg_reg_save_area_bytes != 0) {
 371     // Pop arg register save area
 372     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 373   }
 374 
 375 #if COMPILER2_OR_JVMCI
 376   if (restore_vectors) {
 377     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 378     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 379   }
 380 #else
 381   assert(!restore_vectors, "vectors are generated only by C2");
 382 #endif
 383 
 384   __ vzeroupper();
 385 
 386   // On EVEX enabled targets everything is handled in pop fpu state
 387   if (restore_vectors) {
 388     // Restore upper half of YMM registers (0..15)
 389     int base_addr = XSAVE_AREA_YMM_BEGIN;
 390     for (int n = 0; n < 16; n++) {
 391       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 392     }
 393     if (VM_Version::supports_evex()) {
 394       // Restore upper half of ZMM registers (0..15)
 395       base_addr = XSAVE_AREA_ZMM_BEGIN;
 396       for (int n = 0; n < 16; n++) {
 397         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 398       }
 399       // Restore full ZMM registers(16..num_xmm_regs)
 400       base_addr = XSAVE_AREA_UPPERBANK;
 401       int vector_len = Assembler::AVX_512bit;
 402       int off = 0;
 403       for (int n = 16; n < num_xmm_regs; n++) {
 404         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 405       }
 406 #if COMPILER2_OR_JVMCI
 407       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 408       off = 0;
 409       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 410         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 411       }
 412 #endif
 413     }
 414   } else {
 415     if (VM_Version::supports_evex()) {
 416       // Restore upper bank of ZMM registers(16..31) for double/float usage
 417       int base_addr = XSAVE_AREA_UPPERBANK;
 418       int off = 0;
 419       for (int n = 16; n < num_xmm_regs; n++) {
 420         __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
 421       }
 422 #if COMPILER2_OR_JVMCI
 423       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 424       off = 0;
 425       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 426         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 427       }
 428 #endif
 429     }
 430   }
 431 
 432   // Recover CPU state
 433   __ pop_CPU_state();
 434   // Get the rbp described implicitly by the calling convention (no oopMap)
 435   __ pop(rbp);
 436 }
 437 
 438 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 439 
 440   // Just restore result register. Only used by deoptimization. By
 441   // now any callee save register that needs to be restored to a c2
 442   // caller of the deoptee has been extracted into the vframeArray
 443   // and will be stuffed into the c2i adapter we create for later
 444   // restoration so only result registers need to be restored here.
 445 
 446   // Restore fp result register
 447   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 448   // Restore integer result register
 449   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 450   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 451 
 452   // Pop all of the register save are off the stack except the return address
 453   __ addptr(rsp, return_offset_in_bytes());
 454 }
 455 
 456 // Is vector's size (in bytes) bigger than a size saved by default?
 457 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
 458 bool SharedRuntime::is_wide_vector(int size) {
 459   return size > 16;
 460 }
 461 
 462 // ---------------------------------------------------------------------------
 463 // Read the array of BasicTypes from a signature, and compute where the
 464 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 465 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 466 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 467 // as framesizes are fixed.
 468 // VMRegImpl::stack0 refers to the first slot 0(sp).
 469 // and VMRegImpl::stack0+1 refers to the memory word 4-byes higher.  Register
 470 // up to RegisterImpl::number_of_registers) are the 64-bit
 471 // integer registers.
 472 
 473 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 474 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 475 // units regardless of build. Of course for i486 there is no 64 bit build
 476 
 477 // The Java calling convention is a "shifted" version of the C ABI.
 478 // By skipping the first C ABI register we can call non-static jni methods
 479 // with small numbers of arguments without having to shuffle the arguments
 480 // at all. Since we control the java ABI we ought to at least get some
 481 // advantage out of it.
 482 
 483 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 484                                            VMRegPair *regs,
 485                                            int total_args_passed) {
 486 
 487   // Create the mapping between argument positions and
 488   // registers.
 489   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 490     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 491   };
 492   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 493     j_farg0, j_farg1, j_farg2, j_farg3,
 494     j_farg4, j_farg5, j_farg6, j_farg7
 495   };
 496 
 497 
 498   uint int_args = 0;
 499   uint fp_args = 0;
 500   uint stk_args = 0; // inc by 2 each time
 501 
 502   for (int i = 0; i < total_args_passed; i++) {
 503     switch (sig_bt[i]) {
 504     case T_BOOLEAN:
 505     case T_CHAR:
 506     case T_BYTE:
 507     case T_SHORT:
 508     case T_INT:
 509       if (int_args < Argument::n_int_register_parameters_j) {
 510         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 511       } else {
 512         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 513         stk_args += 2;
 514       }
 515       break;
 516     case T_VOID:
 517       // halves of T_LONG or T_DOUBLE
 518       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 519       regs[i].set_bad();
 520       break;
 521     case T_LONG:
 522       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 523       // fall through
 524     case T_OBJECT:
 525     case T_ARRAY:
 526     case T_ADDRESS:
 527       if (int_args < Argument::n_int_register_parameters_j) {
 528         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 529       } else {
 530         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 531         stk_args += 2;
 532       }
 533       break;
 534     case T_FLOAT:
 535       if (fp_args < Argument::n_float_register_parameters_j) {
 536         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 537       } else {
 538         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 539         stk_args += 2;
 540       }
 541       break;
 542     case T_DOUBLE:
 543       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 544       if (fp_args < Argument::n_float_register_parameters_j) {
 545         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 546       } else {
 547         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 548         stk_args += 2;
 549       }
 550       break;
 551     default:
 552       ShouldNotReachHere();
 553       break;
 554     }
 555   }
 556 
 557   return align_up(stk_args, 2);
 558 }
 559 
 560 // Patch the callers callsite with entry to compiled code if it exists.
 561 static void patch_callers_callsite(MacroAssembler *masm) {
 562   Label L;
 563   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
 564   __ jcc(Assembler::equal, L);
 565 
 566   // Save the current stack pointer
 567   __ mov(r13, rsp);
 568   // Schedule the branch target address early.
 569   // Call into the VM to patch the caller, then jump to compiled callee
 570   // rax isn't live so capture return address while we easily can
 571   __ movptr(rax, Address(rsp, 0));
 572 
 573   // align stack so push_CPU_state doesn't fault
 574   __ andptr(rsp, -(StackAlignmentInBytes));
 575   __ push_CPU_state();
 576   __ vzeroupper();
 577   // VM needs caller's callsite
 578   // VM needs target method
 579   // This needs to be a long call since we will relocate this adapter to
 580   // the codeBuffer and it may not reach
 581 
 582   // Allocate argument register save area
 583   if (frame::arg_reg_save_area_bytes != 0) {
 584     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 585   }
 586   __ mov(c_rarg0, rbx);
 587   __ mov(c_rarg1, rax);
 588   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 589 
 590   // De-allocate argument register save area
 591   if (frame::arg_reg_save_area_bytes != 0) {
 592     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 593   }
 594 
 595   __ vzeroupper();
 596   __ pop_CPU_state();
 597   // restore sp
 598   __ mov(rsp, r13);
 599   __ bind(L);
 600 }
 601 
 602 
 603 static void gen_c2i_adapter(MacroAssembler *masm,
 604                             int total_args_passed,
 605                             int comp_args_on_stack,
 606                             const BasicType *sig_bt,
 607                             const VMRegPair *regs,
 608                             Label& skip_fixup) {
 609   // Before we get into the guts of the C2I adapter, see if we should be here
 610   // at all.  We've come from compiled code and are attempting to jump to the
 611   // interpreter, which means the caller made a static call to get here
 612   // (vcalls always get a compiled target if there is one).  Check for a
 613   // compiled target.  If there is one, we need to patch the caller's call.
 614   patch_callers_callsite(masm);
 615 
 616   __ bind(skip_fixup);
 617 
 618   // Since all args are passed on the stack, total_args_passed *
 619   // Interpreter::stackElementSize is the space we need. Plus 1 because
 620   // we also account for the return address location since
 621   // we store it first rather than hold it in rax across all the shuffling
 622 
 623   int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
 624 
 625   // stack is aligned, keep it that way
 626   extraspace = align_up(extraspace, 2*wordSize);
 627 
 628   // Get return address
 629   __ pop(rax);
 630 
 631   // set senderSP value
 632   __ mov(r13, rsp);
 633 
 634   __ subptr(rsp, extraspace);
 635 
 636   // Store the return address in the expected location
 637   __ movptr(Address(rsp, 0), rax);
 638 
 639   // Now write the args into the outgoing interpreter space
 640   for (int i = 0; i < total_args_passed; i++) {
 641     if (sig_bt[i] == T_VOID) {
 642       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 643       continue;
 644     }
 645 
 646     // offset to start parameters
 647     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 648     int next_off = st_off - Interpreter::stackElementSize;
 649 
 650     // Say 4 args:
 651     // i   st_off
 652     // 0   32 T_LONG
 653     // 1   24 T_VOID
 654     // 2   16 T_OBJECT
 655     // 3    8 T_BOOL
 656     // -    0 return address
 657     //
 658     // However to make thing extra confusing. Because we can fit a long/double in
 659     // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter
 660     // leaves one slot empty and only stores to a single slot. In this case the
 661     // slot that is occupied is the T_VOID slot. See I said it was confusing.
 662 
 663     VMReg r_1 = regs[i].first();
 664     VMReg r_2 = regs[i].second();
 665     if (!r_1->is_valid()) {
 666       assert(!r_2->is_valid(), "");
 667       continue;
 668     }
 669     if (r_1->is_stack()) {
 670       // memory to memory use rax
 671       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 672       if (!r_2->is_valid()) {
 673         // sign extend??
 674         __ movl(rax, Address(rsp, ld_off));
 675         __ movptr(Address(rsp, st_off), rax);
 676 
 677       } else {
 678 
 679         __ movq(rax, Address(rsp, ld_off));
 680 
 681         // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 682         // T_DOUBLE and T_LONG use two slots in the interpreter
 683         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 684           // ld_off == LSW, ld_off+wordSize == MSW
 685           // st_off == MSW, next_off == LSW
 686           __ movq(Address(rsp, next_off), rax);
 687 #ifdef ASSERT
 688           // Overwrite the unused slot with known junk
 689           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 690           __ movptr(Address(rsp, st_off), rax);
 691 #endif /* ASSERT */
 692         } else {
 693           __ movq(Address(rsp, st_off), rax);
 694         }
 695       }
 696     } else if (r_1->is_Register()) {
 697       Register r = r_1->as_Register();
 698       if (!r_2->is_valid()) {
 699         // must be only an int (or less ) so move only 32bits to slot
 700         // why not sign extend??
 701         __ movl(Address(rsp, st_off), r);
 702       } else {
 703         // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 704         // T_DOUBLE and T_LONG use two slots in the interpreter
 705         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 706           // long/double in gpr
 707 #ifdef ASSERT
 708           // Overwrite the unused slot with known junk
 709           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 710           __ movptr(Address(rsp, st_off), rax);
 711 #endif /* ASSERT */
 712           __ movq(Address(rsp, next_off), r);
 713         } else {
 714           __ movptr(Address(rsp, st_off), r);
 715         }
 716       }
 717     } else {
 718       assert(r_1->is_XMMRegister(), "");
 719       if (!r_2->is_valid()) {
 720         // only a float use just part of the slot
 721         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 722       } else {
 723 #ifdef ASSERT
 724         // Overwrite the unused slot with known junk
 725         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 726         __ movptr(Address(rsp, st_off), rax);
 727 #endif /* ASSERT */
 728         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 729       }
 730     }
 731   }
 732 
 733   // Schedule the branch target address early.
 734   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 735   __ jmp(rcx);
 736 }
 737 
 738 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 739                         address code_start, address code_end,
 740                         Label& L_ok) {
 741   Label L_fail;
 742   __ lea(temp_reg, ExternalAddress(code_start));
 743   __ cmpptr(pc_reg, temp_reg);
 744   __ jcc(Assembler::belowEqual, L_fail);
 745   __ lea(temp_reg, ExternalAddress(code_end));
 746   __ cmpptr(pc_reg, temp_reg);
 747   __ jcc(Assembler::below, L_ok);
 748   __ bind(L_fail);
 749 }
 750 
 751 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 752                                     int total_args_passed,
 753                                     int comp_args_on_stack,
 754                                     const BasicType *sig_bt,
 755                                     const VMRegPair *regs) {
 756 
 757   // Note: r13 contains the senderSP on entry. We must preserve it since
 758   // we may do a i2c -> c2i transition if we lose a race where compiled
 759   // code goes non-entrant while we get args ready.
 760   // In addition we use r13 to locate all the interpreter args as
 761   // we must align the stack to 16 bytes on an i2c entry else we
 762   // lose alignment we expect in all compiled code and register
 763   // save code can segv when fxsave instructions find improperly
 764   // aligned stack pointer.
 765 
 766   // Adapters can be frameless because they do not require the caller
 767   // to perform additional cleanup work, such as correcting the stack pointer.
 768   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 769   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 770   // even if a callee has modified the stack pointer.
 771   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 772   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 773   // up via the senderSP register).
 774   // In other words, if *either* the caller or callee is interpreted, we can
 775   // get the stack pointer repaired after a call.
 776   // This is why c2i and i2c adapters cannot be indefinitely composed.
 777   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 778   // both caller and callee would be compiled methods, and neither would
 779   // clean up the stack pointer changes performed by the two adapters.
 780   // If this happens, control eventually transfers back to the compiled
 781   // caller, but with an uncorrected stack, causing delayed havoc.
 782 
 783   // Pick up the return address
 784   __ movptr(rax, Address(rsp, 0));
 785 
 786   if (VerifyAdapterCalls &&
 787       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
 788     // So, let's test for cascading c2i/i2c adapters right now.
 789     //  assert(Interpreter::contains($return_addr) ||
 790     //         StubRoutines::contains($return_addr),
 791     //         "i2c adapter must return to an interpreter frame");
 792     __ block_comment("verify_i2c { ");
 793     Label L_ok;
 794     if (Interpreter::code() != NULL)
 795       range_check(masm, rax, r11,
 796                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
 797                   L_ok);
 798     if (StubRoutines::code1() != NULL)
 799       range_check(masm, rax, r11,
 800                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
 801                   L_ok);
 802     if (StubRoutines::code2() != NULL)
 803       range_check(masm, rax, r11,
 804                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
 805                   L_ok);
 806     const char* msg = "i2c adapter must return to an interpreter frame";
 807     __ block_comment(msg);
 808     __ stop(msg);
 809     __ bind(L_ok);
 810     __ block_comment("} verify_i2ce ");
 811   }
 812 
 813   // Must preserve original SP for loading incoming arguments because
 814   // we need to align the outgoing SP for compiled code.
 815   __ movptr(r11, rsp);
 816 
 817   // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
 818   // in registers, we will occasionally have no stack args.
 819   int comp_words_on_stack = 0;
 820   if (comp_args_on_stack) {
 821     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
 822     // registers are below.  By subtracting stack0, we either get a negative
 823     // number (all values in registers) or the maximum stack slot accessed.
 824 
 825     // Convert 4-byte c2 stack slots to words.
 826     comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 827     // Round up to miminum stack alignment, in wordSize
 828     comp_words_on_stack = align_up(comp_words_on_stack, 2);
 829     __ subptr(rsp, comp_words_on_stack * wordSize);
 830   }
 831 
 832 
 833   // Ensure compiled code always sees stack at proper alignment
 834   __ andptr(rsp, -16);
 835 
 836   // push the return address and misalign the stack that youngest frame always sees
 837   // as far as the placement of the call instruction
 838   __ push(rax);
 839 
 840   // Put saved SP in another register
 841   const Register saved_sp = rax;
 842   __ movptr(saved_sp, r11);
 843 
 844   // Will jump to the compiled code just as if compiled code was doing it.
 845   // Pre-load the register-jump target early, to schedule it better.
 846   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 847 
 848 #if INCLUDE_JVMCI
 849   if (EnableJVMCI) {
 850     // check if this call should be routed towards a specific entry point
 851     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 852     Label no_alternative_target;
 853     __ jcc(Assembler::equal, no_alternative_target);
 854     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 855     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 856     __ bind(no_alternative_target);
 857   }
 858 #endif // INCLUDE_JVMCI
 859 
 860   // Now generate the shuffle code.  Pick up all register args and move the
 861   // rest through the floating point stack top.
 862   for (int i = 0; i < total_args_passed; i++) {
 863     if (sig_bt[i] == T_VOID) {
 864       // Longs and doubles are passed in native word order, but misaligned
 865       // in the 32-bit build.
 866       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 867       continue;
 868     }
 869 
 870     // Pick up 0, 1 or 2 words from SP+offset.
 871 
 872     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 873             "scrambled load targets?");
 874     // Load in argument order going down.
 875     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 876     // Point to interpreter value (vs. tag)
 877     int next_off = ld_off - Interpreter::stackElementSize;
 878     //
 879     //
 880     //
 881     VMReg r_1 = regs[i].first();
 882     VMReg r_2 = regs[i].second();
 883     if (!r_1->is_valid()) {
 884       assert(!r_2->is_valid(), "");
 885       continue;
 886     }
 887     if (r_1->is_stack()) {
 888       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 889       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 890 
 891       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 892       // and if we end up going thru a c2i because of a miss a reasonable value of r13
 893       // will be generated.
 894       if (!r_2->is_valid()) {
 895         // sign extend???
 896         __ movl(r13, Address(saved_sp, ld_off));
 897         __ movptr(Address(rsp, st_off), r13);
 898       } else {
 899         //
 900         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 901         // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
 902         // So we must adjust where to pick up the data to match the interpreter.
 903         //
 904         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
 905         // are accessed as negative so LSW is at LOW address
 906 
 907         // ld_off is MSW so get LSW
 908         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 909                            next_off : ld_off;
 910         __ movq(r13, Address(saved_sp, offset));
 911         // st_off is LSW (i.e. reg.first())
 912         __ movq(Address(rsp, st_off), r13);
 913       }
 914     } else if (r_1->is_Register()) {  // Register argument
 915       Register r = r_1->as_Register();
 916       assert(r != rax, "must be different");
 917       if (r_2->is_valid()) {
 918         //
 919         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 920         // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
 921         // So we must adjust where to pick up the data to match the interpreter.
 922 
 923         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 924                            next_off : ld_off;
 925 
 926         // this can be a misaligned move
 927         __ movq(r, Address(saved_sp, offset));
 928       } else {
 929         // sign extend and use a full word?
 930         __ movl(r, Address(saved_sp, ld_off));
 931       }
 932     } else {
 933       if (!r_2->is_valid()) {
 934         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 935       } else {
 936         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 937       }
 938     }
 939   }
 940 
 941   // 6243940 We might end up in handle_wrong_method if
 942   // the callee is deoptimized as we race thru here. If that
 943   // happens we don't want to take a safepoint because the
 944   // caller frame will look interpreted and arguments are now
 945   // "compiled" so it is much better to make this transition
 946   // invisible to the stack walking code. Unfortunately if
 947   // we try and find the callee by normal means a safepoint
 948   // is possible. So we stash the desired callee in the thread
 949   // and the vm will find there should this case occur.
 950 
 951   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 952 
 953   // put Method* where a c2i would expect should we end up there
 954   // only needed because eof c2 resolve stubs return Method* as a result in
 955   // rax
 956   __ mov(rax, rbx);
 957   __ jmp(r11);
 958 }
 959 
 960 // ---------------------------------------------------------------
 961 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 962                                                             int total_args_passed,
 963                                                             int comp_args_on_stack,
 964                                                             const BasicType *sig_bt,
 965                                                             const VMRegPair *regs,
 966                                                             AdapterFingerPrint* fingerprint) {
 967   address i2c_entry = __ pc();
 968 
 969   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 970 
 971   // -------------------------------------------------------------------------
 972   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 973   // to the interpreter.  The args start out packed in the compiled layout.  They
 974   // need to be unpacked into the interpreter layout.  This will almost always
 975   // require some stack space.  We grow the current (compiled) stack, then repack
 976   // the args.  We  finally end in a jump to the generic interpreter entry point.
 977   // On exit from the interpreter, the interpreter will restore our SP (lest the
 978   // compiled code, which relies solely on SP and not RBP, get sick).
 979 
 980   address c2i_unverified_entry = __ pc();
 981   Label skip_fixup;
 982   Label ok;
 983 
 984   Register holder = rax;
 985   Register receiver = j_rarg0;
 986   Register temp = rbx;
 987 
 988   {
 989     __ load_klass(temp, receiver, rscratch1);
 990     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
 991     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
 992     __ jcc(Assembler::equal, ok);
 993     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 994 
 995     __ bind(ok);
 996     // Method might have been compiled since the call site was patched to
 997     // interpreted if that is the case treat it as a miss so we can get
 998     // the call site corrected.
 999     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
1000     __ jcc(Assembler::equal, skip_fixup);
1001     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1002   }
1003 
1004   address c2i_entry = __ pc();
1005 
1006   // Class initialization barrier for static methods
1007   address c2i_no_clinit_check_entry = NULL;
1008   if (VM_Version::supports_fast_class_init_checks()) {
1009     Label L_skip_barrier;
1010     Register method = rbx;
1011 
1012     { // Bypass the barrier for non-static methods
1013       Register flags  = rscratch1;
1014       __ movl(flags, Address(method, Method::access_flags_offset()));
1015       __ testl(flags, JVM_ACC_STATIC);
1016       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1017     }
1018 
1019     Register klass = rscratch1;
1020     __ load_method_holder(klass, method);
1021     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1022 
1023     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1024 
1025     __ bind(L_skip_barrier);
1026     c2i_no_clinit_check_entry = __ pc();
1027   }
1028 
1029   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1030   bs->c2i_entry_barrier(masm);
1031 
1032   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1033 
1034   __ flush();
1035   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1036 }
1037 
1038 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1039                                          VMRegPair *regs,
1040                                          VMRegPair *regs2,
1041                                          int total_args_passed) {
1042   assert(regs2 == NULL, "not needed on x86");
1043 // We return the amount of VMRegImpl stack slots we need to reserve for all
1044 // the arguments NOT counting out_preserve_stack_slots.
1045 
1046 // NOTE: These arrays will have to change when c1 is ported
1047 #ifdef _WIN64
1048     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1049       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1050     };
1051     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1052       c_farg0, c_farg1, c_farg2, c_farg3
1053     };
1054 #else
1055     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1056       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1057     };
1058     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1059       c_farg0, c_farg1, c_farg2, c_farg3,
1060       c_farg4, c_farg5, c_farg6, c_farg7
1061     };
1062 #endif // _WIN64
1063 
1064 
1065     uint int_args = 0;
1066     uint fp_args = 0;
1067     uint stk_args = 0; // inc by 2 each time
1068 
1069     for (int i = 0; i < total_args_passed; i++) {
1070       switch (sig_bt[i]) {
1071       case T_BOOLEAN:
1072       case T_CHAR:
1073       case T_BYTE:
1074       case T_SHORT:
1075       case T_INT:
1076         if (int_args < Argument::n_int_register_parameters_c) {
1077           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1078 #ifdef _WIN64
1079           fp_args++;
1080           // Allocate slots for callee to stuff register args the stack.
1081           stk_args += 2;
1082 #endif
1083         } else {
1084           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1085           stk_args += 2;
1086         }
1087         break;
1088       case T_LONG:
1089         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1090         // fall through
1091       case T_OBJECT:
1092       case T_ARRAY:
1093       case T_ADDRESS:
1094       case T_METADATA:
1095         if (int_args < Argument::n_int_register_parameters_c) {
1096           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1097 #ifdef _WIN64
1098           fp_args++;
1099           stk_args += 2;
1100 #endif
1101         } else {
1102           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1103           stk_args += 2;
1104         }
1105         break;
1106       case T_FLOAT:
1107         if (fp_args < Argument::n_float_register_parameters_c) {
1108           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1109 #ifdef _WIN64
1110           int_args++;
1111           // Allocate slots for callee to stuff register args the stack.
1112           stk_args += 2;
1113 #endif
1114         } else {
1115           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1116           stk_args += 2;
1117         }
1118         break;
1119       case T_DOUBLE:
1120         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1121         if (fp_args < Argument::n_float_register_parameters_c) {
1122           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1123 #ifdef _WIN64
1124           int_args++;
1125           // Allocate slots for callee to stuff register args the stack.
1126           stk_args += 2;
1127 #endif
1128         } else {
1129           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1130           stk_args += 2;
1131         }
1132         break;
1133       case T_VOID: // Halves of longs and doubles
1134         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1135         regs[i].set_bad();
1136         break;
1137       default:
1138         ShouldNotReachHere();
1139         break;
1140       }
1141     }
1142 #ifdef _WIN64
1143   // windows abi requires that we always allocate enough stack space
1144   // for 4 64bit registers to be stored down.
1145   if (stk_args < 8) {
1146     stk_args = 8;
1147   }
1148 #endif // _WIN64
1149 
1150   return stk_args;
1151 }
1152 
1153 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1154                                              uint num_bits,
1155                                              uint total_args_passed) {
1156   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1157          "only certain vector sizes are supported for now");
1158 
1159   static const XMMRegister VEC_ArgReg[32] = {
1160      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1161      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1162     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1163     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1164   };
1165 
1166   uint stk_args = 0;
1167   uint fp_args = 0;
1168 
1169   for (uint i = 0; i < total_args_passed; i++) {
1170     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1171     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1172     regs[i].set_pair(vmreg->next(next_val), vmreg);
1173   }
1174 
1175   return stk_args;
1176 }
1177 
1178 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1179   // We always ignore the frame_slots arg and just use the space just below frame pointer
1180   // which by this time is free to use
1181   switch (ret_type) {
1182   case T_FLOAT:
1183     __ movflt(Address(rbp, -wordSize), xmm0);
1184     break;
1185   case T_DOUBLE:
1186     __ movdbl(Address(rbp, -wordSize), xmm0);
1187     break;
1188   case T_VOID:  break;
1189   default: {
1190     __ movptr(Address(rbp, -wordSize), rax);
1191     }
1192   }
1193 }
1194 
1195 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1196   // We always ignore the frame_slots arg and just use the space just below frame pointer
1197   // which by this time is free to use
1198   switch (ret_type) {
1199   case T_FLOAT:
1200     __ movflt(xmm0, Address(rbp, -wordSize));
1201     break;
1202   case T_DOUBLE:
1203     __ movdbl(xmm0, Address(rbp, -wordSize));
1204     break;
1205   case T_VOID:  break;
1206   default: {
1207     __ movptr(rax, Address(rbp, -wordSize));
1208     }
1209   }
1210 }
1211 
1212 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1213     for ( int i = first_arg ; i < arg_count ; i++ ) {
1214       if (args[i].first()->is_Register()) {
1215         __ push(args[i].first()->as_Register());
1216       } else if (args[i].first()->is_XMMRegister()) {
1217         __ subptr(rsp, 2*wordSize);
1218         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1219       }
1220     }
1221 }
1222 
1223 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1224     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1225       if (args[i].first()->is_Register()) {
1226         __ pop(args[i].first()->as_Register());
1227       } else if (args[i].first()->is_XMMRegister()) {
1228         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1229         __ addptr(rsp, 2*wordSize);
1230       }
1231     }
1232 }
1233 
1234 // Different signatures may require very different orders for the move
1235 // to avoid clobbering other arguments.  There's no simple way to
1236 // order them safely.  Compute a safe order for issuing stores and
1237 // break any cycles in those stores.  This code is fairly general but
1238 // it's not necessary on the other platforms so we keep it in the
1239 // platform dependent code instead of moving it into a shared file.
1240 // (See bugs 7013347 & 7145024.)
1241 // Note that this code is specific to LP64.
1242 class ComputeMoveOrder: public StackObj {
1243   class MoveOperation: public ResourceObj {
1244     friend class ComputeMoveOrder;
1245    private:
1246     VMRegPair        _src;
1247     VMRegPair        _dst;
1248     int              _src_index;
1249     int              _dst_index;
1250     bool             _processed;
1251     MoveOperation*  _next;
1252     MoveOperation*  _prev;
1253 
1254     static int get_id(VMRegPair r) {
1255       return r.first()->value();
1256     }
1257 
1258    public:
1259     MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1260       _src(src)
1261     , _dst(dst)
1262     , _src_index(src_index)
1263     , _dst_index(dst_index)
1264     , _processed(false)
1265     , _next(NULL)
1266     , _prev(NULL) {
1267     }
1268 
1269     VMRegPair src() const              { return _src; }
1270     int src_id() const                 { return get_id(src()); }
1271     int src_index() const              { return _src_index; }
1272     VMRegPair dst() const              { return _dst; }
1273     void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
1274     int dst_index() const              { return _dst_index; }
1275     int dst_id() const                 { return get_id(dst()); }
1276     MoveOperation* next() const       { return _next; }
1277     MoveOperation* prev() const       { return _prev; }
1278     void set_processed()               { _processed = true; }
1279     bool is_processed() const          { return _processed; }
1280 
1281     // insert
1282     void break_cycle(VMRegPair temp_register) {
1283       // create a new store following the last store
1284       // to move from the temp_register to the original
1285       MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1286 
1287       // break the cycle of links and insert new_store at the end
1288       // break the reverse link.
1289       MoveOperation* p = prev();
1290       assert(p->next() == this, "must be");
1291       _prev = NULL;
1292       p->_next = new_store;
1293       new_store->_prev = p;
1294 
1295       // change the original store to save it's value in the temp.
1296       set_dst(-1, temp_register);
1297     }
1298 
1299     void link(GrowableArray<MoveOperation*>& killer) {
1300       // link this store in front the store that it depends on
1301       MoveOperation* n = killer.at_grow(src_id(), NULL);
1302       if (n != NULL) {
1303         assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1304         _next = n;
1305         n->_prev = this;
1306       }
1307     }
1308   };
1309 
1310  private:
1311   GrowableArray<MoveOperation*> edges;
1312 
1313  public:
1314   ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1315                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1316     // Move operations where the dest is the stack can all be
1317     // scheduled first since they can't interfere with the other moves.
1318     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1319       if (in_sig_bt[i] == T_ARRAY) {
1320         c_arg--;
1321         if (out_regs[c_arg].first()->is_stack() &&
1322             out_regs[c_arg + 1].first()->is_stack()) {
1323           arg_order.push(i);
1324           arg_order.push(c_arg);
1325         } else {
1326           if (out_regs[c_arg].first()->is_stack() ||
1327               in_regs[i].first() == out_regs[c_arg].first()) {
1328             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1329           } else {
1330             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1331           }
1332         }
1333       } else if (in_sig_bt[i] == T_VOID) {
1334         arg_order.push(i);
1335         arg_order.push(c_arg);
1336       } else {
1337         if (out_regs[c_arg].first()->is_stack() ||
1338             in_regs[i].first() == out_regs[c_arg].first()) {
1339           arg_order.push(i);
1340           arg_order.push(c_arg);
1341         } else {
1342           add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1343         }
1344       }
1345     }
    // Break any cycles in the register moves and emit them in the
    // proper order.
1348     GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1349     for (int i = 0; i < stores->length(); i++) {
1350       arg_order.push(stores->at(i)->src_index());
1351       arg_order.push(stores->at(i)->dst_index());
1352     }
1353  }
1354 
  // Collect all the move operations
1356   void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1357     if (src.first() == dst.first()) return;
1358     edges.append(new MoveOperation(src_index, src, dst_index, dst));
1359   }
1360 
  // Walk the edges breaking cycles between moves.  The result list
  // can be walked in order to produce the proper order of stores
1363   GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1364     // Record which moves kill which values
1365     GrowableArray<MoveOperation*> killer;
1366     for (int i = 0; i < edges.length(); i++) {
1367       MoveOperation* s = edges.at(i);
1368       assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1369       killer.at_put_grow(s->dst_id(), s, NULL);
1370     }
1371     assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1372            "make sure temp isn't in the registers that are killed");
1373 
1374     // create links between loads and stores
1375     for (int i = 0; i < edges.length(); i++) {
1376       edges.at(i)->link(killer);
1377     }
1378 
1379     // at this point, all the move operations are chained together
1380     // in a doubly linked list.  Processing it backwards finds
1381     // the beginning of the chain, forwards finds the end.  If there's
1382     // a cycle it can be broken at any point,  so pick an edge and walk
1383     // backward until the list ends or we end where we started.
1384     GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1385     for (int e = 0; e < edges.length(); e++) {
1386       MoveOperation* s = edges.at(e);
1387       if (!s->is_processed()) {
1388         MoveOperation* start = s;
1389         // search for the beginning of the chain or cycle
1390         while (start->prev() != NULL && start->prev() != s) {
1391           start = start->prev();
1392         }
1393         if (start->prev() == s) {
1394           start->break_cycle(temp_register);
1395         }
1396         // walk the chain forward inserting to store list
1397         while (start != NULL) {
1398           stores->append(start);
1399           start->set_processed();
1400           start = start->next();
1401         }
1402       }
1403     }
1404     return stores;
1405   }
1406 };
1407 
1408 static void verify_oop_args(MacroAssembler* masm,
1409                             const methodHandle& method,
1410                             const BasicType* sig_bt,
1411                             const VMRegPair* regs) {
1412   Register temp_reg = rbx;  // not part of any compiled calling seq
1413   if (VerifyOops) {
1414     for (int i = 0; i < method->size_of_parameters(); i++) {
1415       if (is_reference_type(sig_bt[i])) {
1416         VMReg r = regs[i].first();
1417         assert(r->is_valid(), "bad oop arg");
1418         if (r->is_stack()) {
1419           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1420           __ verify_oop(temp_reg);
1421         } else {
1422           __ verify_oop(r->as_Register());
1423         }
1424       }
1425     }
1426   }
1427 }
1428 
1429 static void gen_special_dispatch(MacroAssembler* masm,
1430                                  const methodHandle& method,
1431                                  const BasicType* sig_bt,
1432                                  const VMRegPair* regs) {
1433   verify_oop_args(masm, method, sig_bt, regs);
1434   vmIntrinsics::ID iid = method->intrinsic_id();
1435 
1436   // Now write the args into the outgoing interpreter space
1437   bool     has_receiver   = false;
1438   Register receiver_reg   = noreg;
1439   int      member_arg_pos = -1;
1440   Register member_reg     = noreg;
1441   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1442   if (ref_kind != 0) {
1443     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1444     member_reg = rbx;  // known to be free at this point
1445     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1446   } else if (iid == vmIntrinsics::_invokeBasic) {
1447     has_receiver = true;
1448   } else if (iid == vmIntrinsics::_linkToNative) {
1449     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1450     member_reg = rbx;  // known to be free at this point
1451   } else {
1452     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1453   }
1454 
1455   if (member_reg != noreg) {
1456     // Load the member_arg into register, if necessary.
1457     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1458     VMReg r = regs[member_arg_pos].first();
1459     if (r->is_stack()) {
1460       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1461     } else {
1462       // no data motion is needed
1463       member_reg = r->as_Register();
1464     }
1465   }
1466 
1467   if (has_receiver) {
1468     // Make sure the receiver is loaded into a register.
1469     assert(method->size_of_parameters() > 0, "oob");
1470     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1471     VMReg r = regs[0].first();
1472     assert(r->is_valid(), "bad receiver arg");
1473     if (r->is_stack()) {
1474       // Porting note:  This assumes that compiled calling conventions always
1475       // pass the receiver oop in a register.  If this is not true on some
1476       // platform, pick a temp and load the receiver from stack.
1477       fatal("receiver always in a register");
1478       receiver_reg = j_rarg0;  // known to be free at this point
1479       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1480     } else {
1481       // no data motion is needed
1482       receiver_reg = r->as_Register();
1483     }
1484   }
1485 
1486   // Figure out which address we are really jumping to:
1487   MethodHandles::generate_method_handle_dispatch(masm, iid,
1488                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1489 }
1490 
1491 // ---------------------------------------------------------------------------
1492 // Generate a native wrapper for a given method.  The method takes arguments
1493 // in the Java compiled code convention, marshals them to the native
1494 // convention (handlizes oops, etc), transitions to native, makes the call,
1495 // returns to java state (possibly blocking), unhandlizes any result and
1496 // returns.
1497 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1506 //
1507 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1508                                                 const methodHandle& method,
1509                                                 int compile_id,
1510                                                 BasicType* in_sig_bt,
1511                                                 VMRegPair* in_regs,
1512                                                 BasicType ret_type) {
1513   if (method->is_method_handle_intrinsic()) {
1514     vmIntrinsics::ID iid = method->intrinsic_id();
1515     intptr_t start = (intptr_t)__ pc();
1516     int vep_offset = ((intptr_t)__ pc()) - start;
1517     gen_special_dispatch(masm,
1518                          method,
1519                          in_sig_bt,
1520                          in_regs);
1521     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1522     __ flush();
1523     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1524     return nmethod::new_native_nmethod(method,
1525                                        compile_id,
1526                                        masm->code(),
1527                                        vep_offset,
1528                                        frame_complete,
1529                                        stack_slots / VMRegImpl::slots_per_word,
1530                                        in_ByteSize(-1),
1531                                        in_ByteSize(-1),
1532                                        (OopMapSet*)NULL);
1533   }
1534   address native_func = method->native_function();
1535   assert(native_func != NULL, "must have function");
1536 
1537   // An OopMap for lock (and class if static)
1538   OopMapSet *oop_maps = new OopMapSet();
1539   intptr_t start = (intptr_t)__ pc();
1540 
  // We have received a description of where all the java args are located
1542   // on entry to the wrapper. We need to convert these args to where
1543   // the jni function will expect them. To figure out where they go
1544   // we convert the java signature to a C signature by inserting
1545   // the hidden arguments as arg[0] and possibly arg[1] (static method)
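  // Illustrative example: for an instance method foo(int, Object) the incoming
  // Java signature is (T_OBJECT receiver, T_INT, T_OBJECT) and the C signature
  // built below is (T_ADDRESS /* JNIEnv* */, T_OBJECT, T_INT, T_OBJECT); a
  // static method instead gets an extra leading T_OBJECT for the class mirror.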
1546 
1547   const int total_in_args = method->size_of_parameters();
1548   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1549 
1550   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1551   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1552   BasicType* in_elem_bt = NULL;
1553 
1554   int argc = 0;
1555   out_sig_bt[argc++] = T_ADDRESS;
1556   if (method->is_static()) {
1557     out_sig_bt[argc++] = T_OBJECT;
1558   }
1559 
1560   for (int i = 0; i < total_in_args ; i++ ) {
1561     out_sig_bt[argc++] = in_sig_bt[i];
1562   }
1563 
1564   // Now figure out where the args must be stored and how much stack space
1565   // they require.
1566   int out_arg_slots;
1567   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1568 
1569   // Compute framesize for the wrapper.  We need to handlize all oops in
1570   // incoming registers
1571 
1572   // Calculate the total number of stack slots we will need.
1573 
1574   // First count the abi requirement plus all of the outgoing args
1575   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1576 
1577   // Now the space for the inbound oop handle area
1578   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1579 
1580   int oop_handle_offset = stack_slots;
1581   stack_slots += total_save_slots;
1582 
1583   // Now any space we need for handlizing a klass if static method
1584 
1585   int klass_slot_offset = 0;
1586   int klass_offset = -1;
1587   int lock_slot_offset = 0;
1588   bool is_static = false;
1589 
1590   if (method->is_static()) {
1591     klass_slot_offset = stack_slots;
1592     stack_slots += VMRegImpl::slots_per_word;
1593     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1594     is_static = true;
1595   }
1596 
1597   // Plus a lock if needed
1598 
1599   if (method->is_synchronized()) {
1600     lock_slot_offset = stack_slots;
1601     stack_slots += VMRegImpl::slots_per_word;
1602   }
1603 
1604   // Now a place (+2) to save return values or temp during shuffling
1605   // + 4 for return address (which we own) and saved rbp
1606   stack_slots += 6;
1607 
1608   // Ok The space we have allocated will look like:
1609   //
1610   //
1611   // FP-> |                     |
1612   //      |---------------------|
1613   //      | 2 slots for moves   |
1614   //      |---------------------|
1615   //      | lock box (if sync)  |
1616   //      |---------------------| <- lock_slot_offset
1617   //      | klass (if static)   |
1618   //      |---------------------| <- klass_slot_offset
1619   //      | oopHandle area      |
1620   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1621   //      | outbound memory     |
1622   //      | based arguments     |
1623   //      |                     |
1624   //      |---------------------|
1625   //      |                     |
1626   // SP-> | out_preserved_slots |
1627   //
1628   //
1629 
1630 
  // Now compute the actual number of stack words we need, rounding up to keep
  // the stack properly aligned.
1633   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1634 
1635   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
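
  // Illustrative arithmetic (the actual values depend on the method): with a
  // 16-byte stack alignment StackAlignmentInSlots is 4, so e.g. 38 raw slots
  // round up to 40, giving stack_size = 40 * VMRegImpl::stack_slot_size = 160 bytes.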
1636 
1637   // First thing make an ic check to see if we should even be here
1638 
1639   // We are free to use all registers as temps without saving them and
1640   // restoring them except rbp. rbp is the only callee save register
1641   // as far as the interpreter and the compiler(s) are concerned.
1642 
1643 
1644   const Register ic_reg = rax;
1645   const Register receiver = j_rarg0;
1646 
1647   Label hit;
1648   Label exception_pending;
1649 
1650   assert_different_registers(ic_reg, receiver, rscratch1);
1651   __ verify_oop(receiver);
1652   __ load_klass(rscratch1, receiver, rscratch2);
1653   __ cmpq(ic_reg, rscratch1);
1654   __ jcc(Assembler::equal, hit);
1655 
1656   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1657 
1658   // Verified entry point must be aligned
1659   __ align(8);
1660 
1661   __ bind(hit);
1662 
1663   int vep_offset = ((intptr_t)__ pc()) - start;
1664 
1665   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1666     Label L_skip_barrier;
1667     Register klass = r10;
1668     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1669     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1670 
1671     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1672 
1673     __ bind(L_skip_barrier);
1674   }
1675 
1676 #ifdef COMPILER1
  // For Object.hashCode and System.identityHashCode, try to pull the hashCode from the object header if available.
1678   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1679     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1680   }
1681 #endif // COMPILER1
1682 
1683   // The instruction at the verified entry point must be 5 bytes or longer
  // because it can be patched on the fly by make_not_entrant. The stack bang
1685   // instruction fits that requirement.
1686 
1687   // Generate stack overflow check
1688   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1689 
1690   // Generate a new frame for the wrapper.
1691   __ enter();
1692   // -2 because return address is already present and so is saved rbp
1693   __ subptr(rsp, stack_size - 2*wordSize);
1694 
1695   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1696   bs->nmethod_entry_barrier(masm);
1697 
1698   // Frame is now completed as far as size and linkage.
1699   int frame_complete = ((intptr_t)__ pc()) - start;
1700 
1701     if (UseRTMLocking) {
1702       // Abort RTM transaction before calling JNI
1703       // because critical section will be large and will be
1704       // aborted anyway. Also nmethod could be deoptimized.
1705       __ xabort(0);
1706     }
1707 
1708 #ifdef ASSERT
1709     {
1710       Label L;
1711       __ mov(rax, rsp);
1712       __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1713       __ cmpptr(rax, rsp);
1714       __ jcc(Assembler::equal, L);
1715       __ stop("improperly aligned stack");
1716       __ bind(L);
1717     }
1718 #endif /* ASSERT */
1719 
1720 
1721   // We use r14 as the oop handle for the receiver/klass
1722   // It is callee save so it survives the call to native
1723 
1724   const Register oop_handle_reg = r14;
1725 
1726   //
  // We immediately shuffle the arguments so that for any vm call we have to
  // make from here on out (sync slow path, jvmti, etc.) we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.
1731 
1732   // -----------------
1733   // The Grand Shuffle
1734 
1735   // The Java calling convention is either equal (linux) or denser (win64) than the
  // c calling convention. However, because of the jni_env argument the c calling
1737   // convention always has at least one more (and two for static) arguments than Java.
1738   // Therefore if we move the args from java -> c backwards then we will never have
1739   // a register->register conflict and we don't have to build a dependency graph
1740   // and figure out how to break any cycles.
1741   //
1742 
1743   // Record esp-based slot for receiver on stack for non-static methods
1744   int receiver_offset = -1;
1745 
  // This is a trick. We double the stack slots so we can claim
  // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
  // sure we can capture all the incoming oop args from the
  // caller.
1751   //
1752   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1753 
1754   // Mark location of rbp (someday)
1755   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1756 
1757   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1758   // All inbound args are referenced based on rbp and all outbound args via rsp.
1759 
1760 
1761 #ifdef ASSERT
1762   bool reg_destroyed[RegisterImpl::number_of_registers];
1763   bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1764   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1765     reg_destroyed[r] = false;
1766   }
1767   for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1768     freg_destroyed[f] = false;
1769   }
1770 
1771 #endif /* ASSERT */
1772 
1773   // For JNI natives the incoming and outgoing registers are offset upwards.
1774   GrowableArray<int> arg_order(2 * total_in_args);
1775 
1776   VMRegPair tmp_vmreg;
1777   tmp_vmreg.set2(rbx->as_VMReg());
1778 
1779   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1780     arg_order.push(i);
1781     arg_order.push(c_arg);
1782   }
1783 
1784   int temploc = -1;
1785   for (int ai = 0; ai < arg_order.length(); ai += 2) {
1786     int i = arg_order.at(ai);
1787     int c_arg = arg_order.at(ai + 1);
1788     __ block_comment(err_msg("move %d -> %d", i, c_arg));
1789 #ifdef ASSERT
1790     if (in_regs[i].first()->is_Register()) {
1791       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1792     } else if (in_regs[i].first()->is_XMMRegister()) {
1793       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1794     }
1795     if (out_regs[c_arg].first()->is_Register()) {
1796       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1797     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1798       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1799     }
1800 #endif /* ASSERT */
1801     switch (in_sig_bt[i]) {
1802       case T_ARRAY:
1803       case T_OBJECT:
1804         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1805                     ((i == 0) && (!is_static)),
1806                     &receiver_offset);
1807         break;
1808       case T_VOID:
1809         break;
1810 
1811       case T_FLOAT:
1812         __ float_move(in_regs[i], out_regs[c_arg]);
        break;
1814 
1815       case T_DOUBLE:
1816         assert( i + 1 < total_in_args &&
1817                 in_sig_bt[i + 1] == T_VOID &&
1818                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1819         __ double_move(in_regs[i], out_regs[c_arg]);
1820         break;
1821 
1822       case T_LONG :
1823         __ long_move(in_regs[i], out_regs[c_arg]);
1824         break;
1825 
1826       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1827 
1828       default:
1829         __ move32_64(in_regs[i], out_regs[c_arg]);
1830     }
1831   }
1832 
1833   int c_arg;
1834 
1835   // Pre-load a static method's oop into r14.  Used both by locking code and
1836   // the normal JNI call code.
1837   // point c_arg at the first arg that is already loaded in case we
1838   // need to spill before we call out
1839   c_arg = total_c_args - total_in_args;
1840 
1841   if (method->is_static()) {
1842 
1843     //  load oop into a register
1844     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1845 
    // Now handlize the static class mirror; it's known not-null.
1847     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1848     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1849 
1850     // Now get the handle
1851     __ lea(oop_handle_reg, Address(rsp, klass_offset));
1852     // store the klass handle as second argument
1853     __ movptr(c_rarg1, oop_handle_reg);
1854     // and protect the arg if we must spill
1855     c_arg--;
1856   }
1857 
1858   // Change state to native (we save the return address in the thread, since it might not
  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
1860   // points into the right code segment. It does not have to be the correct return pc.
1861   // We use the same pc/oopMap repeatedly when we call out
1862 
1863   intptr_t the_pc = (intptr_t) __ pc();
1864   oop_maps->add_gc_map(the_pc - start, map);
1865 
1866   __ set_last_Java_frame(rsp, noreg, (address)the_pc);
1867 
1868 
  // We have all of the arguments set up at this point. We must not touch any
  // argument registers at this point (if we were to save/restore them around a
  // call, there would be no oopMap describing any oops they contain).
1871 
1872   {
1873     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
1874     // protect the args we've loaded
1875     save_args(masm, total_c_args, c_arg, out_regs);
1876     __ mov_metadata(c_rarg1, method());
1877     __ call_VM_leaf(
1878       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
1879       r15_thread, c_rarg1);
1880     restore_args(masm, total_c_args, c_arg, out_regs);
1881   }
1882 
1883   // RedefineClasses() tracing support for obsolete method entry
1884   if (log_is_enabled(Trace, redefine, class, obsolete)) {
1885     // protect the args we've loaded
1886     save_args(masm, total_c_args, c_arg, out_regs);
1887     __ mov_metadata(c_rarg1, method());
1888     __ call_VM_leaf(
1889       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
1890       r15_thread, c_rarg1);
1891     restore_args(masm, total_c_args, c_arg, out_regs);
1892   }
1893 
1894   // Lock a synchronized method
1895 
1896   // Register definitions used by locking and unlocking
1897 
1898   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
1899   const Register obj_reg  = rbx;  // Will contain the oop
1900   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
1901   const Register old_hdr  = r13;  // value of old header at unlock time
1902 
1903   Label slow_path_lock;
1904   Label lock_done;
1905 
1906   if (method->is_synchronized()) {
1907 
1908     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
1909 
1910     // Get the handle (the 2nd argument)
1911     __ mov(oop_handle_reg, c_rarg1);
1912 
1913     // Get address of the box
1914 
1915     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
1916 
1917     // Load the oop from the handle
1918     __ movptr(obj_reg, Address(oop_handle_reg, 0));
1919 
1920     if (!UseHeavyMonitors) {
1921       // Load immediate 1 into swap_reg %rax
1922       __ movl(swap_reg, 1);
1923 
1924       // Load (object->mark() | 1) into swap_reg %rax
1925       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1926 
1927       // Save (object->mark() | 1) into BasicLock's displaced header
1928       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
1929 
1930       // src -> dest iff dest == rax else rax <- dest
1931       __ lock();
1932       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1933       __ jcc(Assembler::equal, lock_done);
1934 
1935       // Hmm should this move to the slow path code area???
1936 
      // Test if the oopMark is an obvious stack pointer, i.e.,
      //  1) (mark & 3) == 0, and
      //  2) rsp <= mark < rsp + os::pagesize()
      // These 3 tests can be done by evaluating the following
      // expression: ((mark - rsp) & (3 - os::vm_page_size())),
      // assuming both the stack pointer and pagesize have their
      // least significant 2 bits clear.
1944       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
1945 
1946       __ subptr(swap_reg, rsp);
1947       __ andptr(swap_reg, 3 - os::vm_page_size());
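
      // Illustrative reading of the test above, assuming a 4K page: the mask
      // (3 - os::vm_page_size()) is ~0xffc, so the result is zero exactly when
      // the low two bits of the mark are clear and 0 <= mark - rsp < 0x1000,
      // i.e. the mark points into our own stack frame (recursive lock).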
1948 
      // Save the test result; for the recursive case, the result is zero
1950       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
1951       __ jcc(Assembler::notEqual, slow_path_lock);
1952     } else {
1953       __ jmp(slow_path_lock);
1954     }
1955 
1956     // Slow path will re-enter here
1957 
1958     __ bind(lock_done);
1959   }
1960 
1961   // Finally just about ready to make the JNI call
1962 
1963   // get JNIEnv* which is first argument to native
1964   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
1965 
1966   // Now set thread in native
1967   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
1968 
1969   __ call(RuntimeAddress(native_func));
1970 
1971   // Verify or restore cpu control state after JNI call
1972   __ restore_cpu_control_state_after_jni();
1973 
1974   // Unpack native results.
1975   switch (ret_type) {
1976   case T_BOOLEAN: __ c2bool(rax);            break;
1977   case T_CHAR   : __ movzwl(rax, rax);      break;
1978   case T_BYTE   : __ sign_extend_byte (rax); break;
1979   case T_SHORT  : __ sign_extend_short(rax); break;
1980   case T_INT    : /* nothing to do */        break;
1981   case T_DOUBLE :
1982   case T_FLOAT  :
1983     // Result is in xmm0 we'll save as needed
1984     break;
1985   case T_ARRAY:                 // Really a handle
1986   case T_OBJECT:                // Really a handle
1987       break; // can't de-handlize until after safepoint check
1988   case T_VOID: break;
1989   case T_LONG: break;
1990   default       : ShouldNotReachHere();
1991   }
1992 
1993   Label after_transition;
1994 
1995   // Switch thread to "native transition" state before reading the synchronization state.
1996   // This additional state is necessary because reading and testing the synchronization
1997   // state is not atomic w.r.t. GC, as this scenario demonstrates:
1998   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
1999   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2000   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
2002   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2003 
2004   // Force this write out before the read below
2005   __ membar(Assembler::Membar_mask_bits(
2006               Assembler::LoadLoad | Assembler::LoadStore |
2007               Assembler::StoreLoad | Assembler::StoreStore));
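
  // (The StoreLoad part of the barrier above is what makes the store of
  // _thread_in_native_trans visible before the safepoint/suspend state is read
  // below, closing the race described in the comment preceding the state change.)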
2008 
2009   // check for safepoint operation in progress and/or pending suspend requests
2010   {
2011     Label Continue;
2012     Label slow_path;
2013 
2014     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2015 
2016     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2017     __ jcc(Assembler::equal, Continue);
2018     __ bind(slow_path);
2019 
2020     // Don't use call_VM as it will see a possible pending exception and forward it
2021     // and never return here preventing us from clearing _last_native_pc down below.
2022     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2023     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2024     // by hand.
2025     //
2026     __ vzeroupper();
2027     save_native_result(masm, ret_type, stack_slots);
2028     __ mov(c_rarg0, r15_thread);
2029     __ mov(r12, rsp); // remember sp
2030     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2031     __ andptr(rsp, -16); // align stack as required by ABI
2032     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2033     __ mov(rsp, r12); // restore sp
2034     __ reinit_heapbase();
2035     // Restore any method result value
2036     restore_native_result(masm, ret_type, stack_slots);
2037     __ bind(Continue);
2038   }
2039 
2040   // change thread state
2041   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2042   __ bind(after_transition);
2043 
2044   Label reguard;
2045   Label reguard_done;
2046   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2047   __ jcc(Assembler::equal, reguard);
2048   __ bind(reguard_done);
2049 
2050   // native result if any is live
2051 
2052   // Unlock
2053   Label unlock_done;
2054   Label slow_path_unlock;
2055   if (method->is_synchronized()) {
2056 
2057     // Get locked oop from the handle we passed to jni
2058     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2059 
2060     Label done;
2061 
2062     if (!UseHeavyMonitors) {
2063       // Simple recursive lock?
2064       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2065       __ jcc(Assembler::equal, done);
2066     }
2067 
2068     // Must save rax if it is live now because cmpxchg must use it
2069     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2070       save_native_result(masm, ret_type, stack_slots);
2071     }
2072 
2073 
2074     if (!UseHeavyMonitors) {
2075       // get address of the stack lock
2076       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2077       //  get old displaced header
2078       __ movptr(old_hdr, Address(rax, 0));
2079 
2080       // Atomic swap old header if oop still contains the stack lock
2081       __ lock();
2082       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2083       __ jcc(Assembler::notEqual, slow_path_unlock);
2084     } else {
2085       __ jmp(slow_path_unlock);
2086     }
2087 
2088     // slow path re-enters here
2089     __ bind(unlock_done);
2090     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2091       restore_native_result(masm, ret_type, stack_slots);
2092     }
2093 
2094     __ bind(done);
2095 
2096   }
2097   {
2098     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2099     save_native_result(masm, ret_type, stack_slots);
2100     __ mov_metadata(c_rarg1, method());
2101     __ call_VM_leaf(
2102          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2103          r15_thread, c_rarg1);
2104     restore_native_result(masm, ret_type, stack_slots);
2105   }
2106 
2107   __ reset_last_Java_frame(false);
2108 
2109   // Unbox oop result, e.g. JNIHandles::resolve value.
2110   if (is_reference_type(ret_type)) {
2111     __ resolve_jobject(rax /* value */,
2112                        r15_thread /* thread */,
2113                        rcx /* tmp */);
2114   }
2115 
2116   if (CheckJNICalls) {
2117     // clear_pending_jni_exception_check
2118     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2119   }
2120 
2121   // reset handle block
2122   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2123   __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2124 
2125   // pop our frame
2126 
2127   __ leave();
2128 
2129   // Any exception pending?
2130   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2131   __ jcc(Assembler::notEqual, exception_pending);
2132 
2133   // Return
2134 
2135   __ ret(0);
2136 
2137   // Unexpected paths are out of line and go here
2138 
2139   // forward the exception
2140   __ bind(exception_pending);
2141 
2142   // and forward the exception
2143   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2144 
2145   // Slow path locking & unlocking
2146   if (method->is_synchronized()) {
2147 
2148     // BEGIN Slow path lock
2149     __ bind(slow_path_lock);
2150 
2151     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2152     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2153 
2154     // protect the args we've loaded
2155     save_args(masm, total_c_args, c_arg, out_regs);
2156 
2157     __ mov(c_rarg0, obj_reg);
2158     __ mov(c_rarg1, lock_reg);
2159     __ mov(c_rarg2, r15_thread);
2160 
2161     // Not a leaf but we have last_Java_frame setup as we want
2162     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2163     restore_args(masm, total_c_args, c_arg, out_regs);
2164 
2165 #ifdef ASSERT
2166     { Label L;
2167     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2168     __ jcc(Assembler::equal, L);
2169     __ stop("no pending exception allowed on exit from monitorenter");
2170     __ bind(L);
2171     }
2172 #endif
2173     __ jmp(lock_done);
2174 
2175     // END Slow path lock
2176 
2177     // BEGIN Slow path unlock
2178     __ bind(slow_path_unlock);
2179 
2180     // If we haven't already saved the native result we must save it now as xmm registers
2181     // are still exposed.
2182     __ vzeroupper();
2183     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2184       save_native_result(masm, ret_type, stack_slots);
2185     }
2186 
2187     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2188 
2189     __ mov(c_rarg0, obj_reg);
2190     __ mov(c_rarg2, r15_thread);
2191     __ mov(r12, rsp); // remember sp
2192     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2193     __ andptr(rsp, -16); // align stack as required by ABI
2194 
2195     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2196     // NOTE that obj_reg == rbx currently
2197     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2198     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2199 
2200     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2201     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2202     __ mov(rsp, r12); // restore sp
2203     __ reinit_heapbase();
2204 #ifdef ASSERT
2205     {
2206       Label L;
2207       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2208       __ jcc(Assembler::equal, L);
2209       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2210       __ bind(L);
2211     }
2212 #endif /* ASSERT */
2213 
2214     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2215 
2216     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2217       restore_native_result(masm, ret_type, stack_slots);
2218     }
2219     __ jmp(unlock_done);
2220 
2221     // END Slow path unlock
2222 
2223   } // synchronized
2224 
2225   // SLOW PATH Reguard the stack if needed
2226 
2227   __ bind(reguard);
2228   __ vzeroupper();
2229   save_native_result(masm, ret_type, stack_slots);
2230   __ mov(r12, rsp); // remember sp
2231   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2232   __ andptr(rsp, -16); // align stack as required by ABI
2233   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2234   __ mov(rsp, r12); // restore sp
2235   __ reinit_heapbase();
2236   restore_native_result(masm, ret_type, stack_slots);
2237   // and continue
2238   __ jmp(reguard_done);
2239 
2240 
2241 
2242   __ flush();
2243 
2244   nmethod *nm = nmethod::new_native_nmethod(method,
2245                                             compile_id,
2246                                             masm->code(),
2247                                             vep_offset,
2248                                             frame_complete,
2249                                             stack_slots / VMRegImpl::slots_per_word,
2250                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2251                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2252                                             oop_maps);
2253 
2254   return nm;
2255 }
2256 
// This function returns the adjustment size (in number of words) to a c2i adapter
// activation for use during deoptimization
2259 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2260   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2261 }
2262 
2263 
2264 uint SharedRuntime::out_preserve_stack_slots() {
2265   return 0;
2266 }
2267 
2268 
2269 // Number of stack slots between incoming argument block and the start of
2270 // a new frame.  The PROLOG must add this many slots to the stack.  The
2271 // EPILOG must remove this many slots.  amd64 needs two slots for
2272 // return address.
2273 uint SharedRuntime::in_preserve_stack_slots() {
2274   return 4 + 2 * VerifyStackAtCalls;
2275 }
2276 
2277 //------------------------------generate_deopt_blob----------------------------
2278 void SharedRuntime::generate_deopt_blob() {
2279   // Allocate space for the code
2280   ResourceMark rm;
2281   // Setup code generation tools
2282   int pad = 0;
2283   if (UseAVX > 2) {
2284     pad += 1024;
2285   }
2286 #if INCLUDE_JVMCI
2287   if (EnableJVMCI) {
2288     pad += 512; // Increase the buffer size when compiling for JVMCI
2289   }
2290 #endif
2291   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2292   MacroAssembler* masm = new MacroAssembler(&buffer);
2293   int frame_size_in_words;
2294   OopMap* map = NULL;
2295   OopMapSet *oop_maps = new OopMapSet();
2296 
2297   // -------------
2298   // This code enters when returning to a de-optimized nmethod.  A return
  // address has been pushed on the stack, and return values are in
2300   // registers.
2301   // If we are doing a normal deopt then we were called from the patched
2302   // nmethod from the point we returned to the nmethod. So the return
2303   // address on the stack is wrong by NativeCall::instruction_size
2304   // We will adjust the value so it looks like we have the original return
2305   // address on the stack (like when we eagerly deoptimized).
2306   // In the case of an exception pending when deoptimizing, we enter
2307   // with a return address on the stack that points after the call we patched
2308   // into the exception handler. We have the following register state from,
2309   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2310   //    rax: exception oop
2311   //    rbx: exception handler
2312   //    rdx: throwing pc
2313   // So in this case we simply jam rdx into the useless return address and
2314   // the stack looks just like we want.
2315   //
2316   // At this point we need to de-opt.  We save the argument return
2317   // registers.  We call the first C routine, fetch_unroll_info().  This
2318   // routine captures the return values and returns a structure which
2319   // describes the current frame size and the sizes of all replacement frames.
2320   // The current frame is compiled code and may contain many inlined
2321   // functions, each with their own JVM state.  We pop the current frame, then
2322   // push all the new frames.  Then we call the C routine unpack_frames() to
2323   // populate these frames.  Finally unpack_frames() returns us the new target
2324   // address.  Notice that callee-save registers are BLOWN here; they have
2325   // already been captured in the vframeArray at the time the return PC was
2326   // patched.
2327   address start = __ pc();
2328   Label cont;
2329 
2330   // Prolog for non exception case!
2331 
2332   // Save everything in sight.
2333   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2334 
2335   // Normal deoptimization.  Save exec mode for unpack_frames.
2336   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2337   __ jmp(cont);
2338 
2339   int reexecute_offset = __ pc() - start;
2340 #if INCLUDE_JVMCI && !defined(COMPILER1)
2341   if (EnableJVMCI && UseJVMCICompiler) {
2342     // JVMCI does not use this kind of deoptimization
2343     __ should_not_reach_here();
2344   }
2345 #endif
2346 
2347   // Reexecute case
  // The return address is the pc that describes which bci to re-execute at
2349 
2350   // No need to update map as each call to save_live_registers will produce identical oopmap
2351   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2352 
2353   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2354   __ jmp(cont);
2355 
2356 #if INCLUDE_JVMCI
2357   Label after_fetch_unroll_info_call;
2358   int implicit_exception_uncommon_trap_offset = 0;
2359   int uncommon_trap_offset = 0;
2360 
2361   if (EnableJVMCI) {
2362     implicit_exception_uncommon_trap_offset = __ pc() - start;
2363 
2364     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2365     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2366 
2367     uncommon_trap_offset = __ pc() - start;
2368 
2369     // Save everything in sight.
2370     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2371     // fetch_unroll_info needs to call last_java_frame()
2372     __ set_last_Java_frame(noreg, noreg, NULL);
2373 
2374     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2375     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2376 
2377     __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2378     __ mov(c_rarg0, r15_thread);
2379     __ movl(c_rarg2, r14); // exec mode
2380     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2381     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2382 
2383     __ reset_last_Java_frame(false);
2384 
2385     __ jmp(after_fetch_unroll_info_call);
2386   } // EnableJVMCI
2387 #endif // INCLUDE_JVMCI
2388 
2389   int exception_offset = __ pc() - start;
2390 
2391   // Prolog for exception case
2392 
  // all registers are dead at this entry point, except for rax and
  // rdx, which contain the exception oop and exception pc
2395   // respectively.  Set them in TLS and fall thru to the
2396   // unpack_with_exception_in_tls entry point.
2397 
2398   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2399   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2400 
2401   int exception_in_tls_offset = __ pc() - start;
2402 
2403   // new implementation because exception oop is now passed in JavaThread
2404 
2405   // Prolog for exception case
2406   // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2408   // tos: stack at point of call to method that threw the exception (i.e. only
2409   // args are on the stack, no return address)
2410 
2411   // make room on stack for the return address
2412   // It will be patched later with the throwing pc. The correct value is not
2413   // available now because loading it from memory would destroy registers.
2414   __ push(0);
2415 
2416   // Save everything in sight.
2417   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2418 
2419   // Now it is safe to overwrite any register
2420 
2421   // Deopt during an exception.  Save exec mode for unpack_frames.
2422   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2423 
2424   // load throwing pc from JavaThread and patch it as the return address
2425   // of the current frame. Then clear the field in JavaThread
2426 
2427   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2428   __ movptr(Address(rbp, wordSize), rdx);
2429   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2430 
2431 #ifdef ASSERT
2432   // verify that there is really an exception oop in JavaThread
2433   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2434   __ verify_oop(rax);
2435 
2436   // verify that there is no pending exception
2437   Label no_pending_exception;
2438   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2439   __ testptr(rax, rax);
2440   __ jcc(Assembler::zero, no_pending_exception);
2441   __ stop("must not have pending exception here");
2442   __ bind(no_pending_exception);
2443 #endif
2444 
2445   __ bind(cont);
2446 
2447   // Call C code.  Need thread and this frame, but NOT official VM entry
2448   // crud.  We cannot block on this call, no GC can happen.
2449   //
2450   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2451 
2452   // fetch_unroll_info needs to call last_java_frame().
2453 
2454   __ set_last_Java_frame(noreg, noreg, NULL);
2455 #ifdef ASSERT
2456   { Label L;
2457     __ cmpptr(Address(r15_thread,
2458                     JavaThread::last_Java_fp_offset()),
2459             (int32_t)0);
2460     __ jcc(Assembler::equal, L);
2461     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2462     __ bind(L);
2463   }
2464 #endif // ASSERT
2465   __ mov(c_rarg0, r15_thread);
2466   __ movl(c_rarg1, r14); // exec_mode
2467   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2468 
2469   // Need to have an oopmap that tells fetch_unroll_info where to
2470   // find any register it might need.
2471   oop_maps->add_gc_map(__ pc() - start, map);
2472 
2473   __ reset_last_Java_frame(false);
2474 
2475 #if INCLUDE_JVMCI
2476   if (EnableJVMCI) {
2477     __ bind(after_fetch_unroll_info_call);
2478   }
2479 #endif
2480 
2481   // Load UnrollBlock* into rdi
2482   __ mov(rdi, rax);
2483 
2484   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
  Label noException;
2486   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2487   __ jcc(Assembler::notEqual, noException);
2488   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2489   // QQQ this is useless it was NULL above
2490   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2491   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2492   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2493 
2494   __ verify_oop(rax);
2495 
2496   // Overwrite the result registers with the exception results.
2497   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2498   // I think this is useless
2499   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2500 
2501   __ bind(noException);
2502 
2503   // Only register save data is on the stack.
2504   // Now restore the result registers.  Everything else is either dead
2505   // or captured in the vframeArray.
2506   RegisterSaver::restore_result_registers(masm);
2507 
  // All of the register save area has been popped off the stack. Only the
2509   // return address remains.
2510 
2511   // Pop all the frames we must move/replace.
2512   //
2513   // Frame picture (youngest to oldest)
2514   // 1: self-frame (no frame link)
2515   // 2: deopting frame  (no frame link)
2516   // 3: caller of deopting frame (could be compiled/interpreted).
2517   //
2518   // Note: by leaving the return address of self-frame on the stack
2519   // and using the size of frame 2 to adjust the stack
2520   // when we are done the return to frame 3 will still be on the stack.
2521 
2522   // Pop deoptimized frame
2523   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2524   __ addptr(rsp, rcx);
2525 
2526   // rsp should be pointing at the return address to the caller (3)
2527 
2528   // Pick up the initial fp we should save
2529   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2530   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2531 
2532 #ifdef ASSERT
2533   // Compilers generate code that bang the stack by as much as the
2534   // interpreter would need. So this stack banging should never
2535   // trigger a fault. Verify that it does not on non product builds.
2536   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2537   __ bang_stack_size(rbx, rcx);
2538 #endif
2539 
2540   // Load address of array of frame pcs into rcx
2541   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2542 
2543   // Trash the old pc
2544   __ addptr(rsp, wordSize);
2545 
2546   // Load address of array of frame sizes into rsi
2547   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2548 
2549   // Load counter into rdx
2550   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2551 
  // Now adjust the caller's stack to make up for the extra locals but
  // record the original sp so that we can save it in the skeletal interpreter
  // frame; then the stack walking of interpreter_sender will get the unextended sp
  // value and not the "real" sp value.
2556 
2557   const Register sender_sp = r8;
2558 
2559   __ mov(sender_sp, rsp);
2560   __ movl(rbx, Address(rdi,
2561                        Deoptimization::UnrollBlock::
2562                        caller_adjustment_offset_in_bytes()));
2563   __ subptr(rsp, rbx);
2564 
2565   // Push interpreter frames in a loop
2566   Label loop;
2567   __ bind(loop);
2568   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2569   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2570   __ pushptr(Address(rcx, 0));          // Save return address
2571   __ enter();                           // Save old & set new ebp
2572   __ subptr(rsp, rbx);                  // Prolog
2573   // This value is corrected by layout_activation_impl
2574   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2575   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2576   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2577   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2578   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2579   __ decrementl(rdx);                   // Decrement counter
2580   __ jcc(Assembler::notZero, loop);
2581   __ pushptr(Address(rcx, 0));          // Save final return address
2582 
2583   // Re-push self-frame
2584   __ enter();                           // Save old & set new ebp
2585 
2586   // Allocate a full sized register save area.
2587   // Return address and rbp are in place, so we allocate two less words.
2588   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2589 
2590   // Restore frame locals after moving the frame
2591   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2592   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2593 
2594   // Call C code.  Need thread but NOT official VM entry
2595   // crud.  We cannot block on this call, no GC can happen.  Call should
2596   // restore return values to their stack-slots with the new SP.
2597   //
2598   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2599 
2600   // Use rbp because the frames look interpreted now
2601   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2602   // Don't need the precise return PC here, just precise enough to point into this code blob.
2603   address the_pc = __ pc();
2604   __ set_last_Java_frame(noreg, rbp, the_pc);
2605 
2606   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2607   __ mov(c_rarg0, r15_thread);
2608   __ movl(c_rarg1, r14); // second arg: exec_mode
2609   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2610   // Revert SP alignment after call since we're going to do some SP relative addressing below
2611   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2612 
2613   // Set an oopmap for the call site
2614   // Use the same PC we used for the last java frame
2615   oop_maps->add_gc_map(the_pc - start,
2616                        new OopMap( frame_size_in_words, 0 ));
2617 
2618   // Clear fp AND pc
2619   __ reset_last_Java_frame(true);
2620 
2621   // Collect return values
2622   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2623   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2624   // I think this is useless (throwing pc?)
2625   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2626 
2627   // Pop self-frame.
2628   __ leave();                           // Epilog
2629 
2630   // Jump to interpreter
2631   __ ret(0);
2632 
2633   // Make sure all code is generated
2634   masm->flush();
2635 
2636   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2637   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2638 #if INCLUDE_JVMCI
2639   if (EnableJVMCI) {
2640     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2641     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2642   }
2643 #endif
2644 }
2645 
2646 #ifdef COMPILER2
2647 //------------------------------generate_uncommon_trap_blob--------------------
2648 void SharedRuntime::generate_uncommon_trap_blob() {
2649   // Allocate space for the code
2650   ResourceMark rm;
2651   // Setup code generation tools
2652   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2653   MacroAssembler* masm = new MacroAssembler(&buffer);
2654 
2655   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2656 
2657   address start = __ pc();
2658 
2659   if (UseRTMLocking) {
2660     // Abort RTM transaction before possible nmethod deoptimization.
2661     __ xabort(0);
2662   }
2663 
2664   // Push self-frame.  We get here with a return address on the
2665   // stack, so rsp is 8-byte aligned until we allocate our frame.
2666   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
2667 
2668   // No callee saved registers. rbp is assumed implicitly saved
2669   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2670 
2671   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2672   // runtime expects it.
2673   __ movl(c_rarg1, j_rarg0);
2674 
2675   __ set_last_Java_frame(noreg, noreg, NULL);
2676 
2677   // Call C code.  Need thread but NOT official VM entry
2678   // crud.  We cannot block on this call, no GC can happen.  Call should
2679   // capture callee-saved registers as well as return values.
2680   // Thread is in rdi already.
2681   //
2682   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
2683 
2684   __ mov(c_rarg0, r15_thread);
2685   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2686   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2687 
2688   // Set an oopmap for the call site
2689   OopMapSet* oop_maps = new OopMapSet();
2690   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2691 
2692   // location of rbp is known implicitly by the frame sender code
2693 
2694   oop_maps->add_gc_map(__ pc() - start, map);
2695 
2696   __ reset_last_Java_frame(false);
2697 
2698   // Load UnrollBlock* into rdi
2699   __ mov(rdi, rax);
2700 
2701 #ifdef ASSERT
2702   { Label L;
2703     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2704             (int32_t)Deoptimization::Unpack_uncommon_trap);
2705     __ jcc(Assembler::equal, L);
2706     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2707     __ bind(L);
2708   }
2709 #endif
2710 
2711   // Pop all the frames we must move/replace.
2712   //
2713   // Frame picture (youngest to oldest)
2714   // 1: self-frame (no frame link)
2715   // 2: deopting frame  (no frame link)
2716   // 3: caller of deopting frame (could be compiled/interpreted).
2717 
2718   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2719   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2720 
2721   // Pop deoptimized frame (int)
2722   __ movl(rcx, Address(rdi,
2723                        Deoptimization::UnrollBlock::
2724                        size_of_deoptimized_frame_offset_in_bytes()));
2725   __ addptr(rsp, rcx);
2726 
2727   // rsp should be pointing at the return address to the caller (3)
2728 
2729   // Pick up the initial fp we should save
2730   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2731   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2732 
2733 #ifdef ASSERT
2734   // Compilers generate code that bangs the stack by as much as the
2735   // interpreter would need. So this stack banging should never
2736   // trigger a fault. Verify that it does not on non-product builds.
2737   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2738   __ bang_stack_size(rbx, rcx);
2739 #endif
2740 
2741   // Load address of array of frame pcs into rcx (address*)
2742   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2743 
2744   // Trash the return pc
2745   __ addptr(rsp, wordSize);
2746 
2747   // Load address of array of frame sizes into rsi (intptr_t*)
2748   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2749 
2750   // Counter
2751   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
2752 
2753   // Now adjust the caller's stack to make up for the extra locals but
2754   // record the original sp so that we can save it in the skeletal
2755   // interpreter frame and the stack walking of interpreter_sender
2756   // will get the unextended sp value and not the "real" sp value.
2757 
2758   const Register sender_sp = r8;
2759 
2760   __ mov(sender_sp, rsp);
2761   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
2762   __ subptr(rsp, rbx);
2763 
2764   // Push interpreter frames in a loop
2765   Label loop;
2766   __ bind(loop);
2767   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2768   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2769   __ pushptr(Address(rcx, 0));     // Save return address
2770   __ enter();                      // Save old & set new rbp
2771   __ subptr(rsp, rbx);             // Prolog
2772   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2773             sender_sp);            // Make it walkable
2774   // This value is corrected by layout_activation_impl
2775   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2776   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
2777   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
2778   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
2779   __ decrementl(rdx);              // Decrement counter
2780   __ jcc(Assembler::notZero, loop);
2781   __ pushptr(Address(rcx, 0));     // Save final return address
2782 
2783   // Re-push self-frame
2784   __ enter();                 // Save old & set new rbp
2785   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2786                               // Prolog
2787 
2788   // Use rbp because the frames look interpreted now
2789   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2790   // Don't need the precise return PC here, just precise enough to point into this code blob.
2791   address the_pc = __ pc();
2792   __ set_last_Java_frame(noreg, rbp, the_pc);
2793 
2794   // Call C code.  Need thread but NOT official VM entry
2795   // crud.  We cannot block on this call, no GC can happen.  Call should
2796   // restore return values to their stack-slots with the new SP.
2797   // Thread is in rdi already.
2798   //
2799   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2800 
2801   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2802   __ mov(c_rarg0, r15_thread);
2803   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2804   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2805 
2806   // Set an oopmap for the call site
2807   // Use the same PC we used for the last java frame
2808   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2809 
2810   // Clear fp AND pc
2811   __ reset_last_Java_frame(true);
2812 
2813   // Pop self-frame.
2814   __ leave();                 // Epilog
2815 
2816   // Jump to interpreter
2817   __ ret(0);
2818 
2819   // Make sure all code is generated
2820   masm->flush();
2821 
2822   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
2823                                                  SimpleRuntimeFrame::framesize >> 1);
2824 }
2825 #endif // COMPILER2
2826 
2827 //------------------------------generate_handler_blob------
2828 //
2829 // Generate a special Compile2Runtime blob that saves all registers,
2830 // and sets up an oopmap.
2831 //
2832 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
2833   assert(StubRoutines::forward_exception_entry() != NULL,
2834          "must be generated before");
2835 
2836   ResourceMark rm;
2837   OopMapSet *oop_maps = new OopMapSet();
2838   OopMap* map;
2839 
2840   // Allocate space for the code.  Setup code generation tools.
2841   CodeBuffer buffer("handler_blob", 2048, 1024);
2842   MacroAssembler* masm = new MacroAssembler(&buffer);
2843 
2844   address start   = __ pc();
2845   address call_pc = NULL;
2846   int frame_size_in_words;
2847   bool cause_return = (poll_type == POLL_AT_RETURN);
2848   bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
2849 
2850   if (UseRTMLocking) {
2851     // Abort RTM transaction before calling runtime
2852     // because critical section will be large and will be
2853     // aborted anyway. Also nmethod could be deoptimized.
2854     __ xabort(0);
2855   }
2856 
2857   // Make room for return address (or push it again)
2858   if (!cause_return) {
2859     __ push(rbx);
2860   }
2861 
2862   // Save registers, fpu state, and flags
2863   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
2864 
2865   // The following is basically a call_VM.  However, we need the precise
2866   // address of the call in order to generate an oopmap. Hence, we do all the
2867   // work ourselves.
2868 
2869   __ set_last_Java_frame(noreg, noreg, NULL);
2870 
2871   // The return address must always be correct so that the frame constructor never
2872   // sees an invalid pc.
2873 
2874   if (!cause_return) {
2875     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
2876     // Additionally, rbx is a callee saved register and we can look at it later to determine
2877     // if someone changed the return address for us!
2878     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
2879     __ movptr(Address(rbp, wordSize), rbx);
2880   }
2881 
2882   // Do the call
2883   __ mov(c_rarg0, r15_thread);
2884   __ call(RuntimeAddress(call_ptr));
2885 
2886   // Set an oopmap for the call site.  This oopmap will map all
2887   // oop-registers and debug-info registers as callee-saved.  This
2888   // will allow deoptimization at this safepoint to find all possible
2889   // debug-info recordings, as well as let GC find all oops.
2890 
2891   oop_maps->add_gc_map( __ pc() - start, map);
2892 
2893   Label noException;
2894 
2895   __ reset_last_Java_frame(false);
2896 
2897   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
2898   __ jcc(Assembler::equal, noException);
2899 
2900   // Exception pending
2901 
2902   RegisterSaver::restore_live_registers(masm, save_vectors);
2903 
2904   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2905 
2906   // No exception case
2907   __ bind(noException);
2908 
2909   Label no_adjust;
2910 #ifdef ASSERT
2911   Label bail;
2912 #endif
2913   if (!cause_return) {
2914     Label no_prefix, not_special;
2915 
2916     // If our stashed return pc was modified by the runtime we avoid touching it
2917     __ cmpptr(rbx, Address(rbp, wordSize));
2918     __ jccb(Assembler::notEqual, no_adjust);
2919 
2920     // Skip over the poll instruction.
2921     // See NativeInstruction::is_safepoint_poll()
2922     // Possible encodings:
2923     //      85 00       test   %eax,(%rax)
2924     //      85 01       test   %eax,(%rcx)
2925     //      85 02       test   %eax,(%rdx)
2926     //      85 03       test   %eax,(%rbx)
2927     //      85 06       test   %eax,(%rsi)
2928     //      85 07       test   %eax,(%rdi)
2929     //
2930     //   41 85 00       test   %eax,(%r8)
2931     //   41 85 01       test   %eax,(%r9)
2932     //   41 85 02       test   %eax,(%r10)
2933     //   41 85 03       test   %eax,(%r11)
2934     //   41 85 06       test   %eax,(%r14)
2935     //   41 85 07       test   %eax,(%r15)
2936     //
2937     //      85 04 24    test   %eax,(%rsp)
2938     //   41 85 04 24    test   %eax,(%r12)
2939     //      85 45 00    test   %eax,0x0(%rbp)
2940     //   41 85 45 00    test   %eax,0x0(%r13)
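     //
     // Worked example: for the 4-byte form "41 85 45 00" (test %eax,0x0(%r13))
     // the code below advances rbx by 1 for the REX prefix, by 1 more because
     // the modrm r/m field is 5 (rbp/r13 needs a disp8 byte), and finally by 2
     // for the opcode and modrm bytes themselves, skipping all 4 bytes.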
2941 
2942     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
2943     __ jcc(Assembler::notEqual, no_prefix);
2944     __ addptr(rbx, 1);
2945     __ bind(no_prefix);
2946 #ifdef ASSERT
2947     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
2948 #endif
2949     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
2950     // r12/rsp 0x04
2951     // r13/rbp 0x05
2952     __ movzbq(rcx, Address(rbx, 1));
2953     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
2954     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
2955     __ cmpptr(rcx, 1);
2956     __ jcc(Assembler::above, not_special);
2957     __ addptr(rbx, 1);
2958     __ bind(not_special);
2959 #ifdef ASSERT
2960     // Verify the correct encoding of the poll we're about to skip.
2961     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
2962     __ jcc(Assembler::notEqual, bail);
2963     // Mask out the modrm bits
2964     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
2965     // rax encodes to 0, so if the bits are nonzero it's incorrect
2966     __ jcc(Assembler::notZero, bail);
2967 #endif
2968     // Adjust return pc forward to step over the safepoint poll instruction
2969     __ addptr(rbx, 2);
2970     __ movptr(Address(rbp, wordSize), rbx);
2971   }
2972 
2973   __ bind(no_adjust);
2974   // Normal exit, restore registers and exit.
2975   RegisterSaver::restore_live_registers(masm, save_vectors);
2976   __ ret(0);
2977 
2978 #ifdef ASSERT
2979   __ bind(bail);
2980   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
2981 #endif
2982 
2983   // Make sure all code is generated
2984   masm->flush();
2985 
2986   // Fill-out other meta info
2987   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
2988 }
2989 
2990 //
2991 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
2992 //
2993 // Generate a stub that calls into the VM to find out the proper destination
2994 // of a Java call. All the argument registers are live at this point,
2995 // but since this is generic code we don't know what they are and the caller
2996 // must do any GC of the args.
2997 //
2998 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
2999   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3000 
3001   // allocate space for the code
3002   ResourceMark rm;
3003 
3004   CodeBuffer buffer(name, 1200, 512);
3005   MacroAssembler* masm                = new MacroAssembler(&buffer);
3006 
3007   int frame_size_in_words;
3008 
3009   OopMapSet *oop_maps = new OopMapSet();
3010   OopMap* map = NULL;
3011 
3012   int start = __ offset();
3013 
3014   // No need to save vector registers since they are caller-saved anyway.
3015   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
3016 
3017   int frame_complete = __ offset();
3018 
3019   __ set_last_Java_frame(noreg, noreg, NULL);
3020 
3021   __ mov(c_rarg0, r15_thread);
3022 
3023   __ call(RuntimeAddress(destination));
3024 
3025 
3026   // Set an oopmap for the call site.
3027   // We need this not only for callee-saved registers, but also for volatile
3028   // registers that the compiler might be keeping live across a safepoint.
3029 
3030   oop_maps->add_gc_map( __ offset() - start, map);
3031 
3032   // rax contains the address we are going to jump to, assuming no exception got installed
3033 
3034   // clear last_Java_sp
3035   __ reset_last_Java_frame(false);
3036   // check for pending exceptions
3037   Label pending;
3038   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3039   __ jcc(Assembler::notEqual, pending);
3040 
3041   // get the returned Method*
3042   __ get_vm_result_2(rbx, r15_thread);
3043   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3044 
3045   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3046 
3047   RegisterSaver::restore_live_registers(masm);
3048 
3049   // We are back to the original state on entry and ready to go.
3050 
3051   __ jmp(rax);
3052 
3053   // Pending exception after the safepoint
3054 
3055   __ bind(pending);
3056 
3057   RegisterSaver::restore_live_registers(masm);
3058 
3059   // exception pending => remove activation and forward to exception handler
3060 
3061   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3062 
3063   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3064   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3065 
3066   // -------------
3067   // make sure all code is generated
3068   masm->flush();
3069 
3070   // Return the blob.
3071   // The frame size passed to new_runtime_stub is in words.
3072   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3073 }
3074 
3075 //------------------------------Montgomery multiplication------------------------
3076 //
3077 
3078 #ifndef _WINDOWS
3079 
3080 // Subtract 0:b from carry:a.  Return carry.
3081 static julong
3082 sub(julong a[], julong b[], julong carry, long len) {
3083   long long i = 0, cnt = len;
3084   julong tmp;
3085   asm volatile("clc; "
3086                "0: ; "
3087                "mov (%[b], %[i], 8), %[tmp]; "
3088                "sbb %[tmp], (%[a], %[i], 8); "
3089                "inc %[i]; dec %[cnt]; "
3090                "jne 0b; "
3091                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3092                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3093                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3094                : "memory");
3095   return tmp;
3096 }
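
     // For reference, a portable sketch of the subtraction above (illustrative
     // only; the inline asm is what actually runs): it computes a := a - b over
     // len little-endian words and returns the incoming carry word minus the
     // final borrow.
     //
     //   julong borrow = 0;
     //   for (long i = 0; i < len; i++) {
     //     julong ai = a[i];
     //     a[i] = ai - b[i] - borrow;
     //     borrow = (ai < b[i]) || (borrow && ai == b[i]); // borrow out of this word
     //   }
     //   return carry - borrow;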
3097 
3098 // Multiply (unsigned) Long A by Long B, accumulating the double-
3099 // length result into the accumulator formed of T0, T1, and T2.
3100 #define MACC(A, B, T0, T1, T2)                                  \
3101 do {                                                            \
3102   unsigned long hi, lo;                                         \
3103   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3104            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3105            : "r"(A), "a"(B) : "cc");                            \
3106  } while(0)
3107 
3108 // As above, but add twice the double-length result into the
3109 // accumulator.
3110 #define MACC2(A, B, T0, T1, T2)                                 \
3111 do {                                                            \
3112   unsigned long hi, lo;                                         \
3113   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3114            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3115            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3116            : "r"(A), "a"(B) : "cc");                            \
3117  } while(0)
3118 
3119 #else //_WINDOWS
3120 
3121 static julong
3122 sub(julong a[], julong b[], julong carry, long len) {
3123   long i;
3124   julong tmp;
3125   unsigned char c = 1;
3126   for (i = 0; i < len; i++) {
3127     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3128     a[i] = tmp;
3129   }
3130   c = _addcarry_u64(c, carry, ~0, &tmp);
3131   return tmp;
3132 }
3133 
3134 // Multiply (unsigned) Long A by Long B, accumulating the double-
3135 // length result into the accumulator formed of T0, T1, and T2.
3136 #define MACC(A, B, T0, T1, T2)                          \
3137 do {                                                    \
3138   julong hi, lo;                            \
3139   lo = _umul128(A, B, &hi);                             \
3140   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3141   c = _addcarry_u64(c, hi, T1, &T1);                    \
3142   _addcarry_u64(c, T2, 0, &T2);                         \
3143  } while(0)
3144 
3145 // As above, but add twice the double-length result into the
3146 // accumulator.
3147 #define MACC2(A, B, T0, T1, T2)                         \
3148 do {                                                    \
3149   julong hi, lo;                            \
3150   lo = _umul128(A, B, &hi);                             \
3151   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3152   c = _addcarry_u64(c, hi, T1, &T1);                    \
3153   _addcarry_u64(c, T2, 0, &T2);                         \
3154   c = _addcarry_u64(0, lo, T0, &T0);                    \
3155   c = _addcarry_u64(c, hi, T1, &T1);                    \
3156   _addcarry_u64(c, T2, 0, &T2);                         \
3157  } while(0)
3158 
3159 #endif //_WINDOWS
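
     // For reference, MACC(A, B, T0, T1, T2) adds the 128-bit product A*B into
     // the three-word accumulator T2:T1:T0.  A portable sketch, using the
     // GCC/Clang unsigned __int128 extension (illustrative only; the macros
     // above are what the VM actually compiles):
     //
     //   static inline void macc(julong a, julong b,
     //                           julong &t0, julong &t1, julong &t2) {
     //     unsigned __int128 p = (unsigned __int128)a * b + t0; // product plus low word
     //     t0 = (julong)p;
     //     p = (unsigned __int128)t1 + (julong)(p >> 64);       // high word plus carry into t1
     //     t1 = (julong)p;
     //     t2 += (julong)(p >> 64);                             // carry out into t2
     //   }
     //
     // MACC2 adds the same product twice, i.e. it is equivalent to two macc()
     // calls with the same A and B.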
3160 
3161 // Fast Montgomery multiplication.  The derivation of the algorithm is
3162 // in A Cryptographic Library for the Motorola DSP56000,
3163 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
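     //
     // Informally: with R = 2^(64*len) and inv chosen so that inv * n[0] == -1
     // (mod 2^64), the routine below computes m such that m == a * b * R^-1
     // (mod n).  In each column i the low word of the running sum is cancelled
     // by adding (t0 * inv) * n, which is why the assert checks t0 == 0 right
     // after the MACC of m[i] and n[0]; any carry left in the top word at the
     // end is folded back in by repeatedly subtracting n (the final while loop).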
3164 
3165 static void NOINLINE
3166 montgomery_multiply(julong a[], julong b[], julong n[],
3167                     julong m[], julong inv, int len) {
3168   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3169   int i;
3170 
3171   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3172 
3173   for (i = 0; i < len; i++) {
3174     int j;
3175     for (j = 0; j < i; j++) {
3176       MACC(a[j], b[i-j], t0, t1, t2);
3177       MACC(m[j], n[i-j], t0, t1, t2);
3178     }
3179     MACC(a[i], b[0], t0, t1, t2);
3180     m[i] = t0 * inv;
3181     MACC(m[i], n[0], t0, t1, t2);
3182 
3183     assert(t0 == 0, "broken Montgomery multiply");
3184 
3185     t0 = t1; t1 = t2; t2 = 0;
3186   }
3187 
3188   for (i = len; i < 2*len; i++) {
3189     int j;
3190     for (j = i-len+1; j < len; j++) {
3191       MACC(a[j], b[i-j], t0, t1, t2);
3192       MACC(m[j], n[i-j], t0, t1, t2);
3193     }
3194     m[i-len] = t0;
3195     t0 = t1; t1 = t2; t2 = 0;
3196   }
3197 
3198   while (t0)
3199     t0 = sub(m, n, t0, len);
3200 }
3201 
3202 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3203 // multiplies so it should be up to 25% faster than Montgomery
3204 // multiplication.  However, its loop control is more complex and it
3205 // may actually run slower on some machines.
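     //
     // For example, with len = 4 the multiply routine performs len*len = 16
     // MACCs on the a/b words (plus 16 on the m/n words), while the squaring
     // routine needs only (len*len + len)/2 = 10 MACC/MACC2 operations on the
     // a words (off-diagonal products are computed once and doubled), with the
     // m/n work unchanged.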
3206 
3207 static void NOINLINE
3208 montgomery_square(julong a[], julong n[],
3209                   julong m[], julong inv, int len) {
3210   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3211   int i;
3212 
3213   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3214 
3215   for (i = 0; i < len; i++) {
3216     int j;
3217     int end = (i+1)/2;
3218     for (j = 0; j < end; j++) {
3219       MACC2(a[j], a[i-j], t0, t1, t2);
3220       MACC(m[j], n[i-j], t0, t1, t2);
3221     }
3222     if ((i & 1) == 0) {
3223       MACC(a[j], a[j], t0, t1, t2);
3224     }
3225     for (; j < i; j++) {
3226       MACC(m[j], n[i-j], t0, t1, t2);
3227     }
3228     m[i] = t0 * inv;
3229     MACC(m[i], n[0], t0, t1, t2);
3230 
3231     assert(t0 == 0, "broken Montgomery square");
3232 
3233     t0 = t1; t1 = t2; t2 = 0;
3234   }
3235 
3236   for (i = len; i < 2*len; i++) {
3237     int start = i-len+1;
3238     int end = start + (len - start)/2;
3239     int j;
3240     for (j = start; j < end; j++) {
3241       MACC2(a[j], a[i-j], t0, t1, t2);
3242       MACC(m[j], n[i-j], t0, t1, t2);
3243     }
3244     if ((i & 1) == 0) {
3245       MACC(a[j], a[j], t0, t1, t2);
3246     }
3247     for (; j < len; j++) {
3248       MACC(m[j], n[i-j], t0, t1, t2);
3249     }
3250     m[i-len] = t0;
3251     t0 = t1; t1 = t2; t2 = 0;
3252   }
3253 
3254   while (t0)
3255     t0 = sub(m, n, t0, len);
3256 }
3257 
3258 // Swap words in a longword.
3259 static julong swap(julong x) {
3260   return (x << 32) | (x >> 32);
3261 }
3262 
3263 // Copy len longwords from s to d, word-swapping as we go.  The
3264 // destination array is reversed.
3265 static void reverse_words(julong *s, julong *d, int len) {
3266   d += len;
3267   while(len-- > 0) {
3268     d--;
3269     *d = swap(*s);
3270     s++;
3271   }
3272 }
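
     // For example, with len = 2 and s = { 0x1111111122222222, 0x3333333344444444 },
     // reverse_words(s, d, 2) yields d = { 0x4444444433333333, 0x2222222211111111 }:
     // the longword order is reversed and the two 32-bit halves of each longword
     // are swapped, converting the most-significant-word-first jint layout passed
     // in from Java into the least-significant-word-first julong layout that the
     // routines above expect.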
3273 
3274 // The threshold at which squaring is advantageous was determined
3275 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3276 #define MONTGOMERY_SQUARING_THRESHOLD 64
3277 
3278 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3279                                         jint len, jlong inv,
3280                                         jint *m_ints) {
3281   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3282   int longwords = len/2;
3283 
3284   // Make very sure we don't use so much space that the stack might
3285   // overflow.  512 jints corresponds to a 16384-bit integer and
3286   // will use a total of 8K bytes of stack space here.
3287   int divisor = sizeof(julong) * 4;
3288   guarantee(longwords <= 8192 / divisor, "must be");
3289   int total_allocation = longwords * sizeof (julong) * 4;
3290   julong *scratch = (julong *)alloca(total_allocation);
3291 
3292   // Local scratch arrays
3293   julong
3294     *a = scratch + 0 * longwords,
3295     *b = scratch + 1 * longwords,
3296     *n = scratch + 2 * longwords,
3297     *m = scratch + 3 * longwords;
3298 
3299   reverse_words((julong *)a_ints, a, longwords);
3300   reverse_words((julong *)b_ints, b, longwords);
3301   reverse_words((julong *)n_ints, n, longwords);
3302 
3303   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3304 
3305   reverse_words(m, (julong *)m_ints, longwords);
3306 }
3307 
3308 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3309                                       jint len, jlong inv,
3310                                       jint *m_ints) {
3311   assert(len % 2 == 0, "array length in montgomery_square must be even");
3312   int longwords = len/2;
3313 
3314   // Make very sure we don't use so much space that the stack might
3315   // overflow.  512 jints corresponds to a 16384-bit integer and
3316   // will use a total of 6K bytes of stack space here.
3317   int divisor = sizeof(julong) * 3;
3318   guarantee(longwords <= (8192 / divisor), "must be");
3319   int total_allocation = longwords * sizeof (julong) * 3;
3320   julong *scratch = (julong *)alloca(total_allocation);
3321 
3322   // Local scratch arrays
3323   julong
3324     *a = scratch + 0 * longwords,
3325     *n = scratch + 1 * longwords,
3326     *m = scratch + 2 * longwords;
3327 
3328   reverse_words((julong *)a_ints, a, longwords);
3329   reverse_words((julong *)n_ints, n, longwords);
3330 
3331   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3332     ::montgomery_square(a, n, m, (julong)inv, longwords);
3333   } else {
3334     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3335   }
3336 
3337   reverse_words(m, (julong *)m_ints, longwords);
3338 }
3339 
3340 #ifdef COMPILER2
3341 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3342 //
3343 //------------------------------generate_exception_blob---------------------------
3344 // Creates the exception blob at the end.
3345 // Compiled code jumps into this blob via its exception handler
3346 // (see emit_exception_handler in the x86_64.ad file).
3347 //
3348 // Given an exception pc at a call, we call into the runtime for the
3349 // handler in this method. This handler might merely restore state
3350 // (i.e. callee-saved registers), unwind the frame, and jump to the
3351 // exception handler for the nmethod if there is no Java-level handler
3352 // for the nmethod.
3353 //
3354 // This code is entered with a jmp.
3355 //
3356 // Arguments:
3357 //   rax: exception oop
3358 //   rdx: exception pc
3359 //
3360 // Results:
3361 //   rax: exception oop
3362 //   rdx: exception pc in caller or ???
3363 //   destination: exception handler of caller
3364 //
3365 // Note: the exception pc MUST be at a call (precise debug information)
3366 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3367 //
3368 
3369 void OptoRuntime::generate_exception_blob() {
3370   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3371   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3372   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3373 
3374   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3375 
3376   // Allocate space for the code
3377   ResourceMark rm;
3378   // Setup code generation tools
3379   CodeBuffer buffer("exception_blob", 2048, 1024);
3380   MacroAssembler* masm = new MacroAssembler(&buffer);
3381 
3382 
3383   address start = __ pc();
3384 
3385   // Exception pc is 'return address' for stack walker
3386   __ push(rdx);
3387   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3388 
3389   // Save callee-saved registers.  See x86_64.ad.
3390 
3391   // rbp is an implicitly saved callee saved register (i.e., the calling
3392   // convention will save/restore it in the prolog/epilog). Other than that
3393   // there are no callee save registers now that adapter frames are gone.
3394 
3395   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3396 
3397   // Store exception in Thread object. We cannot pass any arguments to the
3398   // handle_exception call, since we do not want to make any assumption
3399   // about the size of the frame in which the exception happened.
3400   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3401   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3402   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3403 
3404   // This call does all the hard work.  It checks if an exception handler
3405   // exists in the method.
3406   // If so, it returns the handler address.
3407   // If not, it prepares for stack-unwinding, restoring the callee-save
3408   // registers of the frame being removed.
3409   //
3410   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3411 
3412   // At a method handle call, the stack may not be properly aligned
3413   // when returning with an exception.
3414   address the_pc = __ pc();
3415   __ set_last_Java_frame(noreg, noreg, the_pc);
3416   __ mov(c_rarg0, r15_thread);
3417   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3418   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3419 
3420   // Set an oopmap for the call site.  This oopmap will only be used if we
3421   // are unwinding the stack.  Hence, all locations will be dead.
3422   // Callee-saved registers will be the same as the frame above (i.e.,
3423   // handle_exception_stub), since they were restored when we got the
3424   // exception.
3425 
3426   OopMapSet* oop_maps = new OopMapSet();
3427 
3428   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3429 
3430   __ reset_last_Java_frame(false);
3431 
3432   // Restore callee-saved registers
3433 
3434   // rbp is an implicitly saved callee-saved register (i.e., the calling
3435   // convention will save/restore it in the prolog/epilog). Other than that
3436   // there are no callee-saved registers now that adapter frames are gone.
3437 
3438   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3439 
3440   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3441   __ pop(rdx);                  // No need for exception pc anymore
3442 
3443   // rax: exception handler
3444 
3445   // We have a handler in rax (could be deopt blob).
3446   __ mov(r8, rax);
3447 
3448   // Get the exception oop
3449   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3450   // Get the exception pc in case we are deoptimized
3451   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3452 #ifdef ASSERT
3453   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3454   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3455 #endif
3456   // Clear the exception oop so GC no longer processes it as a root.
3457   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3458 
3459   // rax: exception oop
3460   // r8:  exception handler
3461   // rdx: exception pc
3462   // Jump to handler
3463 
3464   __ jmp(r8);
3465 
3466   // Make sure all code is generated
3467   masm->flush();
3468 
3469   // Set exception blob
3470   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3471 }
3472 #endif // COMPILER2
3473