1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/debugInfoRec.hpp"
  32 #include "code/icBuffer.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/disassembler.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "logging/logStream.hpp"
  44 #include "memory/resourceArea.hpp"
  45 #include "memory/universe.hpp"
  46 #include "oops/compiledICHolder.hpp"
  47 #include "oops/klass.inline.hpp"
  48 #include "prims/methodHandles.hpp"
  49 #include "runtime/jniHandles.hpp"
  50 #include "runtime/safepointMechanism.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/signature.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "runtime/vframeArray.hpp"
  55 #include "runtime/vm_version.hpp"
  56 #include "utilities/align.hpp"
  57 #include "utilities/formatBuffer.hpp"
  58 #include "vmreg_x86.inline.hpp"
  59 #ifdef COMPILER1
  60 #include "c1/c1_Runtime1.hpp"
  61 #endif
  62 #ifdef COMPILER2
  63 #include "opto/runtime.hpp"
  64 #endif
  65 #if INCLUDE_JVMCI
  66 #include "jvmci/jvmciJavaClasses.hpp"
  67 #endif
  68 
  69 #define __ masm->
  70 
  71 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  72 
  73 class SimpleRuntimeFrame {
  74 
  75   public:
  76 
  77   // Most of the runtime stubs have this simple frame layout.
  78   // This class exists to make the layout shared in one place.
  79   // Offsets are for compiler stack slots, which are jints.
  80   enum layout {
  81     // The frame sender code expects that rbp will be in the "natural" place and
  82     // will override any oopMap setting for it. We must therefore force the layout
  83     // so that it agrees with the frame sender code.
  84     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  85     rbp_off2,
  86     return_off, return_off2,
  87     framesize
  88   };
  89 };
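     // For illustration, the resulting layout is (stack grows towards lower addresses):
     //   [return address]               return_off/return_off2
     //   [saved rbp]                    rbp_off/rbp_off2
     //   [arg reg save area, if any]    slots 0 .. rbp_off-1
     // and framesize counts all of the above in 32-bit slots.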
  90 
  91 class RegisterSaver {
  92   // Capture info about frame layout.  Layout offsets are in jint
  93   // units because compiler frame slots are jints.
  94 #define XSAVE_AREA_BEGIN 160
  95 #define XSAVE_AREA_YMM_BEGIN 576
  96 #define XSAVE_AREA_OPMASK_BEGIN 1088
  97 #define XSAVE_AREA_ZMM_BEGIN 1152
  98 #define XSAVE_AREA_UPPERBANK 1664
  99 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 100 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 101 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 102 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 103 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
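     // For example, DEF_XMM_OFFS(0) expands to
     //   xmm0_off = xmm_off + (0)*16/BytesPerInt, xmm0H_off
     // and DEF_ZMM_UPPER_OFFS(16) expands to
     //   zmm16_off = zmm_upper_off + (16-16)*64/BytesPerInt, zmm16H_off
     // i.e. each register contributes an _off slot plus an adjacent H_off slot
     // (two 32-bit compiler slots per named entry).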
 104   enum layout {
 105     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 106     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 107     DEF_XMM_OFFS(0),
 108     DEF_XMM_OFFS(1),
 109     // 2..15 are implied in range usage
 110     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 111     DEF_YMM_OFFS(0),
 112     DEF_YMM_OFFS(1),
 113     // 2..15 are implied in range usage
 114     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 115     DEF_OPMASK_OFFS(0),
 116     DEF_OPMASK_OFFS(1),
 117     // 2..7 are implied in range usage
 118     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 119     DEF_ZMM_OFFS(0),
 120     DEF_ZMM_OFFS(1),
 121     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 122     DEF_ZMM_UPPER_OFFS(16),
 123     DEF_ZMM_UPPER_OFFS(17),
 124     // 18..31 are implied in range usage
 125     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 126     fpu_stateH_end,
 127     r15_off, r15H_off,
 128     r14_off, r14H_off,
 129     r13_off, r13H_off,
 130     r12_off, r12H_off,
 131     r11_off, r11H_off,
 132     r10_off, r10H_off,
 133     r9_off,  r9H_off,
 134     r8_off,  r8H_off,
 135     rdi_off, rdiH_off,
 136     rsi_off, rsiH_off,
 137     ignore_off, ignoreH_off,  // extra copy of rbp
 138     rsp_off, rspH_off,
 139     rbx_off, rbxH_off,
 140     rdx_off, rdxH_off,
 141     rcx_off, rcxH_off,
 142     rax_off, raxH_off,
 143     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 144     align_off, alignH_off,
 145     flags_off, flagsH_off,
 146     // The frame sender code expects that rbp will be in the "natural" place and
 147     // will override any oopMap setting for it. We must therefore force the layout
 148     // so that it agrees with the frame sender code.
 149     rbp_off, rbpH_off,        // copy of rbp we will restore
 150     return_off, returnH_off,  // slot for return address
 151     reg_save_size             // size in compiler stack slots
 152   };
 153 
 154  public:
 155   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors);
 156   static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 157 
 158   // Offsets into the register save area
 159   // Used by deoptimization when it is managing result register
 160   // values on its own
 161 
 162   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 163   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 164   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 165   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 166   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 167 
 168   // During deoptimization only the result registers need to be restored,
 169   // all the other values have already been extracted.
 170   static void restore_result_registers(MacroAssembler* masm);
 171 };
 172 
 173 // Register is a class, but it is assigned a numerical value.
 174 // "0" is assigned for rax. Thus we need to ignore -Wnonnull.
 175 PRAGMA_DIAG_PUSH
 176 PRAGMA_NONNULL_IGNORED
 177 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
 178   int off = 0;
 179   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 180   if (UseAVX < 3) {
 181     num_xmm_regs = num_xmm_regs/2;
 182   }
 183 #if COMPILER2_OR_JVMCI
 184   if (save_vectors && UseAVX == 0) {
 185     save_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 186   }
 187   assert(!save_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 188 #else
 189   save_vectors = false; // vectors are generated only by C2 and JVMCI
 190 #endif
 191 
 192   // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
 193   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 194   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 195   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 196   // CodeBlob frame size is in words.
 197   int frame_size_in_words = frame_size_in_bytes / wordSize;
 198   *total_frame_words = frame_size_in_words;
 199 
 200   // Save registers, fpu state, and flags.
 201   // We assume caller has already pushed the return address onto the
 202   // stack, so rsp is 8-byte aligned here.
 203   // We push rbp twice in this sequence because we want the real rbp
 204   // to be under the return address like a normal enter.
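       // After the enter() below the frame starts with
       //   [return address]   (pushed by our caller)
       //   [saved rbp]        <- rbp, rsp 16-byte aligned again
       // and push_CPU_state() lays down the flags, integer registers and the
       // FPU/XSAVE area as described by the layout enum above.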
 205 
 206   __ enter();          // rsp becomes 16-byte aligned here
 207   __ push_CPU_state(); // Push a multiple of 16 bytes
 208 
 209   // push_CPU_state handles this on EVEX enabled targets
 210   if (save_vectors) {
 211     // Save upper half of YMM registers(0..15)
 212     int base_addr = XSAVE_AREA_YMM_BEGIN;
 213     for (int n = 0; n < 16; n++) {
 214       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 215     }
 216     if (VM_Version::supports_evex()) {
 217       // Save upper half of ZMM registers(0..15)
 218       base_addr = XSAVE_AREA_ZMM_BEGIN;
 219       for (int n = 0; n < 16; n++) {
 220         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 221       }
 222       // Save full ZMM registers(16..num_xmm_regs)
 223       base_addr = XSAVE_AREA_UPPERBANK;
 224       off = 0;
 225       int vector_len = Assembler::AVX_512bit;
 226       for (int n = 16; n < num_xmm_regs; n++) {
 227         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 228       }
 229 #if COMPILER2_OR_JVMCI
 230       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 231       off = 0;
 232       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 233         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 234       }
 235 #endif
 236     }
 237   } else {
 238     if (VM_Version::supports_evex()) {
 239       // Save upper bank of ZMM registers(16..31) for double/float usage
 240       int base_addr = XSAVE_AREA_UPPERBANK;
 241       off = 0;
 242       for (int n = 16; n < num_xmm_regs; n++) {
 243         __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
 244       }
 245 #if COMPILER2_OR_JVMCI
 246       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 247       off = 0;
 248       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 249         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 250       }
 251 #endif
 252     }
 253   }
 254   __ vzeroupper();
 255   if (frame::arg_reg_save_area_bytes != 0) {
 256     // Allocate argument register save area
 257     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 258   }
 259 
 260   // Set an oopmap for the call site.  This oopmap will map all
 261   // oop-registers and debug-info registers as callee-saved.  This
 262   // will allow deoptimization at this safepoint to find all possible
 263   // debug-info recordings, as well as let GC find all oops.
 264 
 265   OopMapSet *oop_maps = new OopMapSet();
 266   OopMap* map = new OopMap(frame_size_in_slots, 0);
 267 
 268 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 269 
 270   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 271   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 272   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 273   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 274   // rbp location is known implicitly by the frame sender code, needs no oopmap
 275   // and the location where rbp was saved is ignored
 276   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 283   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 284   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 285   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 286   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 287   // on EVEX enabled targets, we get it included in the xsave area
 288   off = xmm0_off;
 289   int delta = xmm1_off - off;
 290   for (int n = 0; n < 16; n++) {
 291     XMMRegister xmm_name = as_XMMRegister(n);
 292     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 293     off += delta;
 294   }
 295   if (UseAVX > 2) {
 296     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 297     off = zmm16_off;
 298     delta = zmm17_off - off;
 299     for (int n = 16; n < num_xmm_regs; n++) {
 300       XMMRegister zmm_name = as_XMMRegister(n);
 301       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 302       off += delta;
 303     }
 304   }
 305 
 306 #if COMPILER2_OR_JVMCI
 307   if (save_vectors) {
 308     // Save upper half of YMM registers(0..15)
 309     off = ymm0_off;
 310     delta = ymm1_off - ymm0_off;
 311     for (int n = 0; n < 16; n++) {
 312       XMMRegister ymm_name = as_XMMRegister(n);
 313       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 314       off += delta;
 315     }
 316     if (VM_Version::supports_evex()) {
 317       // Save upper half of ZMM registers(0..15)
 318       off = zmm0_off;
 319       delta = zmm1_off - zmm0_off;
 320       for (int n = 0; n < 16; n++) {
 321         XMMRegister zmm_name = as_XMMRegister(n);
 322         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 323         off += delta;
 324       }
 325     }
 326   }
 327 #endif // COMPILER2_OR_JVMCI
 328 
 329   // %%% These should all be a waste but we'll keep things as they were for now
 330   if (true) {
 331     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 333     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 335     // rbp location is known implicitly by the frame sender code, needs no oopmap
 336     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 343     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 344     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 345     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 346     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 347     // on EVEX enabled targets, we get it included in the xsave area
 348     off = xmm0H_off;
 349     delta = xmm1H_off - off;
 350     for (int n = 0; n < 16; n++) {
 351       XMMRegister xmm_name = as_XMMRegister(n);
 352       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 353       off += delta;
 354     }
 355     if (UseAVX > 2) {
 356       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 357       off = zmm16H_off;
 358       delta = zmm17H_off - off;
 359       for (int n = 16; n < num_xmm_regs; n++) {
 360         XMMRegister zmm_name = as_XMMRegister(n);
 361         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 362         off += delta;
 363       }
 364     }
 365   }
 366 
 367   return map;
 368 }
 369 PRAGMA_DIAG_POP
 370 
 371 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
 372   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 373   if (UseAVX < 3) {
 374     num_xmm_regs = num_xmm_regs/2;
 375   }
 376   if (frame::arg_reg_save_area_bytes != 0) {
 377     // Pop arg register save area
 378     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 379   }
 380 
 381 #if COMPILER2_OR_JVMCI
 382   if (restore_vectors) {
 383     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 384     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 385   }
 386 #else
 387   assert(!restore_vectors, "vectors are generated only by C2");
 388 #endif
 389 
 390   __ vzeroupper();
 391 
 392   // On EVEX enabled targets everything is handled in pop fpu state
 393   if (restore_vectors) {
 394     // Restore upper half of YMM registers (0..15)
 395     int base_addr = XSAVE_AREA_YMM_BEGIN;
 396     for (int n = 0; n < 16; n++) {
 397       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 398     }
 399     if (VM_Version::supports_evex()) {
 400       // Restore upper half of ZMM registers (0..15)
 401       base_addr = XSAVE_AREA_ZMM_BEGIN;
 402       for (int n = 0; n < 16; n++) {
 403         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 404       }
 405       // Restore full ZMM registers(16..num_xmm_regs)
 406       base_addr = XSAVE_AREA_UPPERBANK;
 407       int vector_len = Assembler::AVX_512bit;
 408       int off = 0;
 409       for (int n = 16; n < num_xmm_regs; n++) {
 410         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 411       }
 412 #if COMPILER2_OR_JVMCI
 413       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 414       off = 0;
 415       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 416         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 417       }
 418 #endif
 419     }
 420   } else {
 421     if (VM_Version::supports_evex()) {
 422       // Restore upper bank of ZMM registers(16..31) for double/float usage
 423       int base_addr = XSAVE_AREA_UPPERBANK;
 424       int off = 0;
 425       for (int n = 16; n < num_xmm_regs; n++) {
 426         __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
 427       }
 428 #if COMPILER2_OR_JVMCI
 429       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 430       off = 0;
 431       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 432         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 433       }
 434 #endif
 435     }
 436   }
 437 
 438   // Recover CPU state
 439   __ pop_CPU_state();
 440   // Get the rbp described implicitly by the calling convention (no oopMap)
 441   __ pop(rbp);
 442 }
 443 
 444 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 445 
 446   // Just restore result register. Only used by deoptimization. By
 447   // now any callee save register that needs to be restored to a c2
 448   // caller of the deoptee has been extracted into the vframeArray
 449   // and will be stuffed into the c2i adapter we create for later
 450   // restoration so only result registers need to be restored here.
 451 
 452   // Restore fp result register
 453   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 454   // Restore integer result register
 455   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 456   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 457 
 458   // Pop all of the register save area off the stack except the return address
 459   __ addptr(rsp, return_offset_in_bytes());
 460 }
 461 
 462 // Is the vector's size (in bytes) bigger than the size saved by default?
 463 // The low 16 bytes of the XMM registers are saved by default using fxsave/fxrstor instructions.
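     // When it is, save_live_registers()/restore_live_registers() are called with
     // save_vectors == true so the YMM/ZMM upper halves are preserved as well.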
 464 bool SharedRuntime::is_wide_vector(int size) {
 465   return size > 16;
 466 }
 467 
 468 // ---------------------------------------------------------------------------
 469 // Read the array of BasicTypes from a signature, and compute where the
 470 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 471 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 472 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 473 // as framesizes are fixed.
 474 // VMRegImpl::stack0 refers to the first slot 0(sp),
 475 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Registers
 476 // (up to RegisterImpl::number_of_registers) are the 64-bit
 477 // integer registers.
 478 
 479 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 480 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 481 // units regardless of build. Of course for i486 there is no 64 bit build
 482 
 483 // The Java calling convention is a "shifted" version of the C ABI.
 484 // By skipping the first C ABI register we can call non-static jni methods
 485 // with small numbers of arguments without having to shuffle the arguments
 486 // at all. Since we control the java ABI we ought to at least get some
 487 // advantage out of it.
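     // For example, on Linux/System V the C ABI passes the first integer args in
     //   rdi, rsi, rdx, rcx, r8, r9
     // while the Java convention below uses
     //   j_rarg0..j_rarg5 == rsi, rdx, rcx, r8, r9, rdi
     // i.e. the same registers shifted by one, leaving c_rarg0 (rdi) free for the
     // JNIEnv* that a native wrapper has to prepend.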
 488 
 489 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 490                                            VMRegPair *regs,
 491                                            int total_args_passed) {
 492 
 493   // Create the mapping between argument positions and
 494   // registers.
 495   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 496     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 497   };
 498   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 499     j_farg0, j_farg1, j_farg2, j_farg3,
 500     j_farg4, j_farg5, j_farg6, j_farg7
 501   };
 502 
 503 
 504   uint int_args = 0;
 505   uint fp_args = 0;
 506   uint stk_args = 0; // inc by 2 each time
 507 
 508   for (int i = 0; i < total_args_passed; i++) {
 509     switch (sig_bt[i]) {
 510     case T_BOOLEAN:
 511     case T_CHAR:
 512     case T_BYTE:
 513     case T_SHORT:
 514     case T_INT:
 515       if (int_args < Argument::n_int_register_parameters_j) {
 516         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 517       } else {
 518         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 519         stk_args += 2;
 520       }
 521       break;
 522     case T_VOID:
 523       // halves of T_LONG or T_DOUBLE
 524       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 525       regs[i].set_bad();
 526       break;
 527     case T_LONG:
 528       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 529       // fall through
 530     case T_OBJECT:
 531     case T_ARRAY:
 532     case T_ADDRESS:
 533       if (int_args < Argument::n_int_register_parameters_j) {
 534         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 535       } else {
 536         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 537         stk_args += 2;
 538       }
 539       break;
 540     case T_FLOAT:
 541       if (fp_args < Argument::n_float_register_parameters_j) {
 542         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 543       } else {
 544         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 545         stk_args += 2;
 546       }
 547       break;
 548     case T_DOUBLE:
 549       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 550       if (fp_args < Argument::n_float_register_parameters_j) {
 551         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 552       } else {
 553         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 554         stk_args += 2;
 555       }
 556       break;
 557     default:
 558       ShouldNotReachHere();
 559       break;
 560     }
 561   }
 562 
 563   return align_up(stk_args, 2);
 564 }
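     // For example, for a static method taking (long, double, int), sig_bt is
     //   { T_LONG, T_VOID, T_DOUBLE, T_VOID, T_INT }
     // and the loop above assigns
     //   long -> j_rarg0, double -> j_farg0, int -> j_rarg1,
     // uses no stack slots, and returns 0.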
 565 
 566 // Patch the callers callsite with entry to compiled code if it exists.
 567 static void patch_callers_callsite(MacroAssembler *masm) {
 568   Label L;
 569   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
 570   __ jcc(Assembler::equal, L);
 571 
 572   // Save the current stack pointer
 573   __ mov(r13, rsp);
 574   // Schedule the branch target address early.
 575   // Call into the VM to patch the caller, then jump to compiled callee
 576   // rax isn't live so capture return address while we easily can
 577   __ movptr(rax, Address(rsp, 0));
 578 
 579   // align stack so push_CPU_state doesn't fault
 580   __ andptr(rsp, -(StackAlignmentInBytes));
 581   __ push_CPU_state();
 582   __ vzeroupper();
 583   // VM needs caller's callsite
 584   // VM needs target method
 585   // This needs to be a long call since we will relocate this adapter to
 586   // the codeBuffer and it may not reach
 587 
 588   // Allocate argument register save area
 589   if (frame::arg_reg_save_area_bytes != 0) {
 590     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 591   }
 592   __ mov(c_rarg0, rbx);
 593   __ mov(c_rarg1, rax);
 594   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 595 
 596   // De-allocate argument register save area
 597   if (frame::arg_reg_save_area_bytes != 0) {
 598     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 599   }
 600 
 601   __ vzeroupper();
 602   __ pop_CPU_state();
 603   // restore sp
 604   __ mov(rsp, r13);
 605   __ bind(L);
 606 }
 607 
 608 
 609 static void gen_c2i_adapter(MacroAssembler *masm,
 610                             int total_args_passed,
 611                             int comp_args_on_stack,
 612                             const BasicType *sig_bt,
 613                             const VMRegPair *regs,
 614                             Label& skip_fixup) {
 615   // Before we get into the guts of the C2I adapter, see if we should be here
 616   // at all.  We've come from compiled code and are attempting to jump to the
 617   // interpreter, which means the caller made a static call to get here
 618   // (vcalls always get a compiled target if there is one).  Check for a
 619   // compiled target.  If there is one, we need to patch the caller's call.
 620   patch_callers_callsite(masm);
 621 
 622   __ bind(skip_fixup);
 623 
 624   // Since all args are passed on the stack, total_args_passed *
 625   // Interpreter::stackElementSize is the space we need. Plus 1 because
 626   // we also account for the return address location since
 627   // we store it first rather than hold it in rax across all the shuffling
 628 
 629   int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
 630 
 631   // stack is aligned, keep it that way
 632   extraspace = align_up(extraspace, 2*wordSize);
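       // For example, five interpreter slots (say a long, a double and an int)
       // give extraspace = 5*8 + 8 = 48 bytes, already 16-byte aligned.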
 633 
 634   // Get return address
 635   __ pop(rax);
 636 
 637   // set senderSP value
 638   __ mov(r13, rsp);
 639 
 640   __ subptr(rsp, extraspace);
 641 
 642   // Store the return address in the expected location
 643   __ movptr(Address(rsp, 0), rax);
 644 
 645   // Now write the args into the outgoing interpreter space
 646   for (int i = 0; i < total_args_passed; i++) {
 647     if (sig_bt[i] == T_VOID) {
 648       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 649       continue;
 650     }
 651 
 652     // offset to start parameters
 653     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 654     int next_off = st_off - Interpreter::stackElementSize;
 655 
 656     // Say 4 args:
 657     // i   st_off
 658     // 0   32 T_LONG
 659     // 1   24 T_VOID
 660     // 2   16 T_OBJECT
 661     // 3    8 T_BOOL
 662     // -    0 return address
 663     //
 664     // However, to make things extra confusing: because we can fit a long/double in
 665     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 666     // leaves one slot empty and only stores to a single slot. In this case the
 667     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
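         // With the example above, the T_LONG at i == 0 is stored once at
         // next_off == 24 (the T_VOID slot); in debug builds st_off == 32 is
         // filled with a recognizable junk pattern instead.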
 668 
 669     VMReg r_1 = regs[i].first();
 670     VMReg r_2 = regs[i].second();
 671     if (!r_1->is_valid()) {
 672       assert(!r_2->is_valid(), "");
 673       continue;
 674     }
 675     if (r_1->is_stack()) {
 676       // memory to memory use rax
 677       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 678       if (!r_2->is_valid()) {
 679         // sign extend??
 680         __ movl(rax, Address(rsp, ld_off));
 681         __ movptr(Address(rsp, st_off), rax);
 682 
 683       } else {
 684 
 685         __ movq(rax, Address(rsp, ld_off));
 686 
 687         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 688         // T_DOUBLE and T_LONG use two slots in the interpreter
 689         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 690           // ld_off == LSW, ld_off+wordSize == MSW
 691           // st_off == MSW, next_off == LSW
 692           __ movq(Address(rsp, next_off), rax);
 693 #ifdef ASSERT
 694           // Overwrite the unused slot with known junk
 695           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 696           __ movptr(Address(rsp, st_off), rax);
 697 #endif /* ASSERT */
 698         } else {
 699           __ movq(Address(rsp, st_off), rax);
 700         }
 701       }
 702     } else if (r_1->is_Register()) {
 703       Register r = r_1->as_Register();
 704       if (!r_2->is_valid()) {
 705         // must be only an int (or smaller) so move only 32 bits to the slot
 706         // why not sign extend??
 707         __ movl(Address(rsp, st_off), r);
 708       } else {
 709         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 710         // T_DOUBLE and T_LONG use two slots in the interpreter
 711         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 712           // long/double in gpr
 713 #ifdef ASSERT
 714           // Overwrite the unused slot with known junk
 715           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 716           __ movptr(Address(rsp, st_off), rax);
 717 #endif /* ASSERT */
 718           __ movq(Address(rsp, next_off), r);
 719         } else {
 720           __ movptr(Address(rsp, st_off), r);
 721         }
 722       }
 723     } else {
 724       assert(r_1->is_XMMRegister(), "");
 725       if (!r_2->is_valid()) {
 726         // only a float, use just part of the slot
 727         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 728       } else {
 729 #ifdef ASSERT
 730         // Overwrite the unused slot with known junk
 731         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 732         __ movptr(Address(rsp, st_off), rax);
 733 #endif /* ASSERT */
 734         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 735       }
 736     }
 737   }
 738 
 739   // Schedule the branch target address early.
 740   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 741   __ jmp(rcx);
 742 }
 743 
 744 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 745                         address code_start, address code_end,
 746                         Label& L_ok) {
 747   Label L_fail;
 748   __ lea(temp_reg, ExternalAddress(code_start));
 749   __ cmpptr(pc_reg, temp_reg);
 750   __ jcc(Assembler::belowEqual, L_fail);
 751   __ lea(temp_reg, ExternalAddress(code_end));
 752   __ cmpptr(pc_reg, temp_reg);
 753   __ jcc(Assembler::below, L_ok);
 754   __ bind(L_fail);
 755 }
 756 
 757 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 758                                     int total_args_passed,
 759                                     int comp_args_on_stack,
 760                                     const BasicType *sig_bt,
 761                                     const VMRegPair *regs) {
 762 
 763   // Note: r13 contains the senderSP on entry. We must preserve it since
 764   // we may do a i2c -> c2i transition if we lose a race where compiled
 765   // code goes non-entrant while we get args ready.
 766   // In addition we use r13 to locate all the interpreter args as
 767   // we must align the stack to 16 bytes on an i2c entry else we
 768   // lose alignment we expect in all compiled code and register
 769   // save code can segv when fxsave instructions find improperly
 770   // aligned stack pointer.
 771 
 772   // Adapters can be frameless because they do not require the caller
 773   // to perform additional cleanup work, such as correcting the stack pointer.
 774   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 775   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 776   // even if a callee has modified the stack pointer.
 777   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 778   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 779   // up via the senderSP register).
 780   // In other words, if *either* the caller or callee is interpreted, we can
 781   // get the stack pointer repaired after a call.
 782   // This is why c2i and i2c adapters cannot be indefinitely composed.
 783   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 784   // both caller and callee would be compiled methods, and neither would
 785   // clean up the stack pointer changes performed by the two adapters.
 786   // If this happens, control eventually transfers back to the compiled
 787   // caller, but with an uncorrected stack, causing delayed havoc.
 788 
 789   // Pick up the return address
 790   __ movptr(rax, Address(rsp, 0));
 791 
 792   if (VerifyAdapterCalls &&
 793       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
 794     // So, let's test for cascading c2i/i2c adapters right now.
 795     //  assert(Interpreter::contains($return_addr) ||
 796     //         StubRoutines::contains($return_addr),
 797     //         "i2c adapter must return to an interpreter frame");
 798     __ block_comment("verify_i2c { ");
 799     Label L_ok;
 800     if (Interpreter::code() != NULL)
 801       range_check(masm, rax, r11,
 802                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
 803                   L_ok);
 804     if (StubRoutines::code1() != NULL)
 805       range_check(masm, rax, r11,
 806                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
 807                   L_ok);
 808     if (StubRoutines::code2() != NULL)
 809       range_check(masm, rax, r11,
 810                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
 811                   L_ok);
 812     const char* msg = "i2c adapter must return to an interpreter frame";
 813     __ block_comment(msg);
 814     __ stop(msg);
 815     __ bind(L_ok);
 816     __ block_comment("} verify_i2ce ");
 817   }
 818 
 819   // Must preserve original SP for loading incoming arguments because
 820   // we need to align the outgoing SP for compiled code.
 821   __ movptr(r11, rsp);
 822 
 823   // Cut-out for having no stack args.  Since up to 6 int/oop args are passed
 824   // in registers, we will occasionally have no stack args.
 825   int comp_words_on_stack = 0;
 826   if (comp_args_on_stack) {
 827     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
 828     // registers are below.  By subtracting stack0, we either get a negative
 829     // number (all values in registers) or the maximum stack slot accessed.
 830 
 831     // Convert 4-byte c2 stack slots to words.
 832     comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 833     // Round up to minimum stack alignment, in wordSize
 834     comp_words_on_stack = align_up(comp_words_on_stack, 2);
 835     __ subptr(rsp, comp_words_on_stack * wordSize);
 836   }
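       // For example, comp_args_on_stack == 3 (three 4-byte slots) gives
       // align_up(12, 8) == 16 bytes == 2 words, already even, so rsp drops by 16.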
 837 
 838 
 839   // Ensure compiled code always sees stack at proper alignment
 840   __ andptr(rsp, -16);
 841 
 842   // push the return address and misalign the stack so that the youngest frame always sees
 843   // the layout it would have just after the placement of a call instruction
 844   __ push(rax);
 845 
 846   // Put saved SP in another register
 847   const Register saved_sp = rax;
 848   __ movptr(saved_sp, r11);
 849 
 850   // Will jump to the compiled code just as if compiled code was doing it.
 851   // Pre-load the register-jump target early, to schedule it better.
 852   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 853 
 854 #if INCLUDE_JVMCI
 855   if (EnableJVMCI) {
 856     // check if this call should be routed towards a specific entry point
 857     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 858     Label no_alternative_target;
 859     __ jcc(Assembler::equal, no_alternative_target);
 860     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 861     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 862     __ bind(no_alternative_target);
 863   }
 864 #endif // INCLUDE_JVMCI
 865 
 866   // Now generate the shuffle code.  Pick up all register args and move the
 867   // rest through the floating point stack top.
 868   for (int i = 0; i < total_args_passed; i++) {
 869     if (sig_bt[i] == T_VOID) {
 870       // Longs and doubles are passed in native word order, but misaligned
 871       // in the 32-bit build.
 872       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 873       continue;
 874     }
 875 
 876     // Pick up 0, 1 or 2 words from SP+offset.
 877 
 878     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 879             "scrambled load targets?");
 880     // Load in argument order going down.
 881     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 882     // Point to interpreter value (vs. tag)
 883     int next_off = ld_off - Interpreter::stackElementSize;
 884     //
 885     //
 886     //
 887     VMReg r_1 = regs[i].first();
 888     VMReg r_2 = regs[i].second();
 889     if (!r_1->is_valid()) {
 890       assert(!r_2->is_valid(), "");
 891       continue;
 892     }
 893     if (r_1->is_stack()) {
 894       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 895       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 896 
 897       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 898       // and if we end up going thru a c2i because of a miss a reasonable value of r13
 899       // will be generated.
 900       if (!r_2->is_valid()) {
 901         // sign extend???
 902         __ movl(r13, Address(saved_sp, ld_off));
 903         __ movptr(Address(rsp, st_off), r13);
 904       } else {
 905         //
 906         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 907         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
 908         // So we must adjust where to pick up the data to match the interpreter.
 909         //
 910         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
 911         // are accessed as negative so LSW is at LOW address
 912 
 913         // ld_off is MSW so get LSW
 914         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 915                            next_off : ld_off;
 916         __ movq(r13, Address(saved_sp, offset));
 917         // st_off is LSW (i.e. reg.first())
 918         __ movq(Address(rsp, st_off), r13);
 919       }
 920     } else if (r_1->is_Register()) {  // Register argument
 921       Register r = r_1->as_Register();
 922       assert(r != rax, "must be different");
 923       if (r_2->is_valid()) {
 924         //
 925         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 926         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
 927         // So we must adjust where to pick up the data to match the interpreter.
 928 
 929         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 930                            next_off : ld_off;
 931 
 932         // this can be a misaligned move
 933         __ movq(r, Address(saved_sp, offset));
 934       } else {
 935         // sign extend and use a full word?
 936         __ movl(r, Address(saved_sp, ld_off));
 937       }
 938     } else {
 939       if (!r_2->is_valid()) {
 940         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 941       } else {
 942         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 943       }
 944     }
 945   }
 946 
 947   // 6243940 We might end up in handle_wrong_method if
 948   // the callee is deoptimized as we race thru here. If that
 949   // happens we don't want to take a safepoint because the
 950   // caller frame will look interpreted and arguments are now
 951   // "compiled" so it is much better to make this transition
 952   // invisible to the stack walking code. Unfortunately if
 953   // we try and find the callee by normal means a safepoint
 954   // is possible. So we stash the desired callee in the thread
 955   // and the vm will find it there should this case occur.
 956 
 957   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 958 
 959   // put Method* where a c2i would expect it should we end up there
 960   // only needed because c2 resolve stubs return Method* as a result in
 961   // rax
 962   __ mov(rax, rbx);
 963   __ jmp(r11);
 964 }
 965 
 966 // ---------------------------------------------------------------
 967 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 968                                                             int total_args_passed,
 969                                                             int comp_args_on_stack,
 970                                                             const BasicType *sig_bt,
 971                                                             const VMRegPair *regs,
 972                                                             AdapterFingerPrint* fingerprint) {
 973   address i2c_entry = __ pc();
 974 
 975   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 976 
 977   // -------------------------------------------------------------------------
 978   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 979   // to the interpreter.  The args start out packed in the compiled layout.  They
 980   // need to be unpacked into the interpreter layout.  This will almost always
 981   // require some stack space.  We grow the current (compiled) stack, then repack
 982   // the args.  We  finally end in a jump to the generic interpreter entry point.
 983   // On exit from the interpreter, the interpreter will restore our SP (lest the
 984   // compiled code, which relies solely on SP and not RBP, get sick).
 985 
 986   address c2i_unverified_entry = __ pc();
 987   Label skip_fixup;
 988   Label ok;
 989 
 990   Register holder = rax;
 991   Register receiver = j_rarg0;
 992   Register temp = rbx;
 993 
 994   {
 995     __ load_klass(temp, receiver, rscratch1);
 996     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
 997     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
 998     __ jcc(Assembler::equal, ok);
 999     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1000 
1001     __ bind(ok);
1002     // Method might have been compiled since the call site was patched to
1003     // interpreted; if that is the case, treat it as a miss so we can get
1004     // the call site corrected.
1005     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
1006     __ jcc(Assembler::equal, skip_fixup);
1007     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1008   }
1009 
1010   address c2i_entry = __ pc();
1011 
1012   // Class initialization barrier for static methods
1013   address c2i_no_clinit_check_entry = NULL;
1014   if (VM_Version::supports_fast_class_init_checks()) {
1015     Label L_skip_barrier;
1016     Register method = rbx;
1017 
1018     { // Bypass the barrier for non-static methods
1019       Register flags  = rscratch1;
1020       __ movl(flags, Address(method, Method::access_flags_offset()));
1021       __ testl(flags, JVM_ACC_STATIC);
1022       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1023     }
1024 
1025     Register klass = rscratch1;
1026     __ load_method_holder(klass, method);
1027     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1028 
1029     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1030 
1031     __ bind(L_skip_barrier);
1032     c2i_no_clinit_check_entry = __ pc();
1033   }
1034 
1035   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1036   bs->c2i_entry_barrier(masm);
1037 
1038   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1039 
1040   __ flush();
1041   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1042 }
1043 
1044 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1045                                          VMRegPair *regs,
1046                                          VMRegPair *regs2,
1047                                          int total_args_passed) {
1048   assert(regs2 == NULL, "not needed on x86");
1049 // We return the amount of VMRegImpl stack slots we need to reserve for all
1050 // the arguments NOT counting out_preserve_stack_slots.
1051 
1052 // NOTE: These arrays will have to change when c1 is ported
1053 #ifdef _WIN64
1054     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1055       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1056     };
1057     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1058       c_farg0, c_farg1, c_farg2, c_farg3
1059     };
1060 #else
1061     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1062       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1063     };
1064     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1065       c_farg0, c_farg1, c_farg2, c_farg3,
1066       c_farg4, c_farg5, c_farg6, c_farg7
1067     };
1068 #endif // _WIN64
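         // Note the Win64 difference below: the four argument positions are shared
         // between integer and FP args (hence the extra int_args++/fp_args++ bumps),
         // and each register arg still reserves an 8-byte home slot on the stack.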
1069 
1070 
1071     uint int_args = 0;
1072     uint fp_args = 0;
1073     uint stk_args = 0; // inc by 2 each time
1074 
1075     for (int i = 0; i < total_args_passed; i++) {
1076       switch (sig_bt[i]) {
1077       case T_BOOLEAN:
1078       case T_CHAR:
1079       case T_BYTE:
1080       case T_SHORT:
1081       case T_INT:
1082         if (int_args < Argument::n_int_register_parameters_c) {
1083           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1084 #ifdef _WIN64
1085           fp_args++;
1086           // Allocate slots for callee to stuff register args on the stack.
1087           stk_args += 2;
1088 #endif
1089         } else {
1090           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1091           stk_args += 2;
1092         }
1093         break;
1094       case T_LONG:
1095         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1096         // fall through
1097       case T_OBJECT:
1098       case T_ARRAY:
1099       case T_ADDRESS:
1100       case T_METADATA:
1101         if (int_args < Argument::n_int_register_parameters_c) {
1102           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1103 #ifdef _WIN64
1104           fp_args++;
1105           stk_args += 2;
1106 #endif
1107         } else {
1108           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1109           stk_args += 2;
1110         }
1111         break;
1112       case T_FLOAT:
1113         if (fp_args < Argument::n_float_register_parameters_c) {
1114           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1115 #ifdef _WIN64
1116           int_args++;
1117           // Allocate slots for callee to stuff register args on the stack.
1118           stk_args += 2;
1119 #endif
1120         } else {
1121           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1122           stk_args += 2;
1123         }
1124         break;
1125       case T_DOUBLE:
1126         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1127         if (fp_args < Argument::n_float_register_parameters_c) {
1128           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1129 #ifdef _WIN64
1130           int_args++;
1131           // Allocate slots for callee to stuff register args on the stack.
1132           stk_args += 2;
1133 #endif
1134         } else {
1135           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1136           stk_args += 2;
1137         }
1138         break;
1139       case T_VOID: // Halves of longs and doubles
1140         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1141         regs[i].set_bad();
1142         break;
1143       default:
1144         ShouldNotReachHere();
1145         break;
1146       }
1147     }
1148 #ifdef _WIN64
1149   // The Windows ABI requires that we always allocate enough stack space
1150   // for 4 64-bit registers to be stored down.
1151   if (stk_args < 8) {
1152     stk_args = 8;
1153   }
1154 #endif // _WIN64
1155 
1156   return stk_args;
1157 }
1158 
1159 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1160                                              uint num_bits,
1161                                              uint total_args_passed) {
1162   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1163          "only certain vector sizes are supported for now");
1164 
1165   static const XMMRegister VEC_ArgReg[32] = {
1166      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1167      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1168     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1169     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1170   };
1171 
1172   uint stk_args = 0;
1173   uint fp_args = 0;
1174 
1175   for (uint i = 0; i < total_args_passed; i++) {
1176     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1177     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1178     regs[i].set_pair(vmreg->next(next_val), vmreg);
1179   }
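       // Each VMReg slot represents 32 bits of the register, so next_val is
       // num_bits/32 - 1 and the resulting pair spans the whole vector.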
1180 
1181   return stk_args;
1182 }
1183 
1184 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1185   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1186   // which by this time is free to use
1187   switch (ret_type) {
1188   case T_FLOAT:
1189     __ movflt(Address(rbp, -wordSize), xmm0);
1190     break;
1191   case T_DOUBLE:
1192     __ movdbl(Address(rbp, -wordSize), xmm0);
1193     break;
1194   case T_VOID:  break;
1195   default: {
1196     __ movptr(Address(rbp, -wordSize), rax);
1197     }
1198   }
1199 }
1200 
1201 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1202   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1203   // which by this time is free to use
1204   switch (ret_type) {
1205   case T_FLOAT:
1206     __ movflt(xmm0, Address(rbp, -wordSize));
1207     break;
1208   case T_DOUBLE:
1209     __ movdbl(xmm0, Address(rbp, -wordSize));
1210     break;
1211   case T_VOID:  break;
1212   default: {
1213     __ movptr(rax, Address(rbp, -wordSize));
1214     }
1215   }
1216 }
1217 
1218 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1219     for ( int i = first_arg ; i < arg_count ; i++ ) {
1220       if (args[i].first()->is_Register()) {
1221         __ push(args[i].first()->as_Register());
1222       } else if (args[i].first()->is_XMMRegister()) {
1223         __ subptr(rsp, 2*wordSize);
1224         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1225       }
1226     }
1227 }
1228 
1229 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1230     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1231       if (args[i].first()->is_Register()) {
1232         __ pop(args[i].first()->as_Register());
1233       } else if (args[i].first()->is_XMMRegister()) {
1234         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1235         __ addptr(rsp, 2*wordSize);
1236       }
1237     }
1238 }
1239 
1240 // Different signatures may require very different orders for the move
1241 // to avoid clobbering other arguments.  There's no simple way to
1242 // order them safely.  Compute a safe order for issuing stores and
1243 // break any cycles in those stores.  This code is fairly general but
1244 // it's not necessary on the other platforms so we keep it in the
1245 // platform dependent code instead of moving it into a shared file.
1246 // (See bugs 7013347 & 7145024.)
1247 // Note that this code is specific to LP64.
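     // For example, if one argument must move rsi -> rdx while another moves
     // rdx -> rsi, the two stores form a cycle; it is broken by routing one value
     // through the caller-supplied temporary (rdx -> tmp, rsi -> rdx, tmp -> rsi).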
1248 class ComputeMoveOrder: public StackObj {
1249   class MoveOperation: public ResourceObj {
1250     friend class ComputeMoveOrder;
1251    private:
1252     VMRegPair        _src;
1253     VMRegPair        _dst;
1254     int              _src_index;
1255     int              _dst_index;
1256     bool             _processed;
1257     MoveOperation*  _next;
1258     MoveOperation*  _prev;
1259 
1260     static int get_id(VMRegPair r) {
1261       return r.first()->value();
1262     }
1263 
1264    public:
1265     MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1266       _src(src)
1267     , _dst(dst)
1268     , _src_index(src_index)
1269     , _dst_index(dst_index)
1270     , _processed(false)
1271     , _next(NULL)
1272     , _prev(NULL) {
1273     }
1274 
1275     VMRegPair src() const              { return _src; }
1276     int src_id() const                 { return get_id(src()); }
1277     int src_index() const              { return _src_index; }
1278     VMRegPair dst() const              { return _dst; }
1279     void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
1280     int dst_index() const              { return _dst_index; }
1281     int dst_id() const                 { return get_id(dst()); }
1282     MoveOperation* next() const       { return _next; }
1283     MoveOperation* prev() const       { return _prev; }
1284     void set_processed()               { _processed = true; }
1285     bool is_processed() const          { return _processed; }
1286 
1287     // insert
1288     void break_cycle(VMRegPair temp_register) {
1289       // create a new store following the last store
1290       // to move from the temp_register to the original
1291       MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1292 
1293       // break the cycle of links and insert new_store at the end
1294       // break the reverse link.
1295       MoveOperation* p = prev();
1296       assert(p->next() == this, "must be");
1297       _prev = NULL;
1298       p->_next = new_store;
1299       new_store->_prev = p;
1300 
1301       // change the original store to save its value in the temp.
1302       set_dst(-1, temp_register);
1303     }
1304 
1305     void link(GrowableArray<MoveOperation*>& killer) {
      // link this store in front of the store that it depends on
1307       MoveOperation* n = killer.at_grow(src_id(), NULL);
1308       if (n != NULL) {
1309         assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1310         _next = n;
1311         n->_prev = this;
1312       }
1313     }
1314   };
1315 
1316  private:
1317   GrowableArray<MoveOperation*> edges;
1318 
1319  public:
1320   ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1321                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1322     // Move operations where the dest is the stack can all be
1323     // scheduled first since they can't interfere with the other moves.
1324     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1325       if (in_sig_bt[i] == T_ARRAY) {
1326         c_arg--;
1327         if (out_regs[c_arg].first()->is_stack() &&
1328             out_regs[c_arg + 1].first()->is_stack()) {
1329           arg_order.push(i);
1330           arg_order.push(c_arg);
1331         } else {
1332           if (out_regs[c_arg].first()->is_stack() ||
1333               in_regs[i].first() == out_regs[c_arg].first()) {
1334             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1335           } else {
1336             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1337           }
1338         }
1339       } else if (in_sig_bt[i] == T_VOID) {
1340         arg_order.push(i);
1341         arg_order.push(c_arg);
1342       } else {
1343         if (out_regs[c_arg].first()->is_stack() ||
1344             in_regs[i].first() == out_regs[c_arg].first()) {
1345           arg_order.push(i);
1346           arg_order.push(c_arg);
1347         } else {
1348           add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1349         }
1350       }
1351     }
    // Break any cycles in the register moves and emit them in the
    // proper order.
1354     GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1355     for (int i = 0; i < stores->length(); i++) {
1356       arg_order.push(stores->at(i)->src_index());
1357       arg_order.push(stores->at(i)->dst_index());
1358     }
1359  }
1360 
  // Collect all the move operations
1362   void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1363     if (src.first() == dst.first()) return;
1364     edges.append(new MoveOperation(src_index, src, dst_index, dst));
1365   }
1366 
  // Walk the edges breaking cycles between moves.  The result list
  // can be walked in order to produce the proper set of stores
1369   GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1370     // Record which moves kill which values
1371     GrowableArray<MoveOperation*> killer;
1372     for (int i = 0; i < edges.length(); i++) {
1373       MoveOperation* s = edges.at(i);
1374       assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1375       killer.at_put_grow(s->dst_id(), s, NULL);
1376     }
1377     assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1378            "make sure temp isn't in the registers that are killed");
1379 
1380     // create links between loads and stores
1381     for (int i = 0; i < edges.length(); i++) {
1382       edges.at(i)->link(killer);
1383     }
1384 
    // At this point, all the move operations are chained together
    // in a doubly linked list.  Processing it backwards finds
    // the beginning of the chain, forwards finds the end.  If there's
    // a cycle it can be broken at any point, so pick an edge and walk
    // backward until the list ends or we arrive back where we started.
1390     GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1391     for (int e = 0; e < edges.length(); e++) {
1392       MoveOperation* s = edges.at(e);
1393       if (!s->is_processed()) {
1394         MoveOperation* start = s;
1395         // search for the beginning of the chain or cycle
1396         while (start->prev() != NULL && start->prev() != s) {
1397           start = start->prev();
1398         }
1399         if (start->prev() == s) {
1400           start->break_cycle(temp_register);
1401         }
1402         // walk the chain forward inserting to store list
1403         while (start != NULL) {
1404           stores->append(start);
1405           start->set_processed();
1406           start = start->next();
1407         }
1408       }
1409     }
1410     return stores;
1411   }
1412 };
1413 
1414 static void verify_oop_args(MacroAssembler* masm,
1415                             const methodHandle& method,
1416                             const BasicType* sig_bt,
1417                             const VMRegPair* regs) {
1418   Register temp_reg = rbx;  // not part of any compiled calling seq
1419   if (VerifyOops) {
1420     for (int i = 0; i < method->size_of_parameters(); i++) {
1421       if (is_reference_type(sig_bt[i])) {
1422         VMReg r = regs[i].first();
1423         assert(r->is_valid(), "bad oop arg");
1424         if (r->is_stack()) {
1425           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1426           __ verify_oop(temp_reg);
1427         } else {
1428           __ verify_oop(r->as_Register());
1429         }
1430       }
1431     }
1432   }
1433 }
1434 
1435 static void gen_special_dispatch(MacroAssembler* masm,
1436                                  const methodHandle& method,
1437                                  const BasicType* sig_bt,
1438                                  const VMRegPair* regs) {
1439   verify_oop_args(masm, method, sig_bt, regs);
1440   vmIntrinsics::ID iid = method->intrinsic_id();
1441 
1442   // Now write the args into the outgoing interpreter space
1443   bool     has_receiver   = false;
1444   Register receiver_reg   = noreg;
1445   int      member_arg_pos = -1;
1446   Register member_reg     = noreg;
1447   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1448   if (ref_kind != 0) {
1449     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1450     member_reg = rbx;  // known to be free at this point
1451     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1452   } else if (iid == vmIntrinsics::_invokeBasic) {
1453     has_receiver = true;
1454   } else if (iid == vmIntrinsics::_linkToNative) {
1455     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1456     member_reg = rbx;  // known to be free at this point
1457   } else {
1458     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1459   }
1460 
1461   if (member_reg != noreg) {
1462     // Load the member_arg into register, if necessary.
1463     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1464     VMReg r = regs[member_arg_pos].first();
1465     if (r->is_stack()) {
1466       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1467     } else {
1468       // no data motion is needed
1469       member_reg = r->as_Register();
1470     }
1471   }
1472 
1473   if (has_receiver) {
1474     // Make sure the receiver is loaded into a register.
1475     assert(method->size_of_parameters() > 0, "oob");
1476     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1477     VMReg r = regs[0].first();
1478     assert(r->is_valid(), "bad receiver arg");
1479     if (r->is_stack()) {
1480       // Porting note:  This assumes that compiled calling conventions always
1481       // pass the receiver oop in a register.  If this is not true on some
1482       // platform, pick a temp and load the receiver from stack.
1483       fatal("receiver always in a register");
1484       receiver_reg = j_rarg0;  // known to be free at this point
1485       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1486     } else {
1487       // no data motion is needed
1488       receiver_reg = r->as_Register();
1489     }
1490   }
1491 
1492   // Figure out which address we are really jumping to:
1493   MethodHandles::generate_method_handle_dispatch(masm, iid,
1494                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1495 }
1496 
1497 // ---------------------------------------------------------------------------
1498 // Generate a native wrapper for a given method.  The method takes arguments
1499 // in the Java compiled code convention, marshals them to the native
1500 // convention (handlizes oops, etc), transitions to native, makes the call,
1501 // returns to java state (possibly blocking), unhandlizes any result and
1502 // returns.
1503 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee.  Critical native functions leave the state
// _in_Java, since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI
// handle block and the check for pending exceptions, since it is impossible
// for them to be thrown.
1512 //
1513 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1514                                                 const methodHandle& method,
1515                                                 int compile_id,
1516                                                 BasicType* in_sig_bt,
1517                                                 VMRegPair* in_regs,
1518                                                 BasicType ret_type) {
1519   if (method->is_method_handle_intrinsic()) {
1520     vmIntrinsics::ID iid = method->intrinsic_id();
1521     intptr_t start = (intptr_t)__ pc();
1522     int vep_offset = ((intptr_t)__ pc()) - start;
1523     gen_special_dispatch(masm,
1524                          method,
1525                          in_sig_bt,
1526                          in_regs);
1527     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1528     __ flush();
1529     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1530     return nmethod::new_native_nmethod(method,
1531                                        compile_id,
1532                                        masm->code(),
1533                                        vep_offset,
1534                                        frame_complete,
1535                                        stack_slots / VMRegImpl::slots_per_word,
1536                                        in_ByteSize(-1),
1537                                        in_ByteSize(-1),
1538                                        (OopMapSet*)NULL);
1539   }
1540   address native_func = method->native_function();
1541   assert(native_func != NULL, "must have function");
1542 
1543   // An OopMap for lock (and class if static)
1544   OopMapSet *oop_maps = new OopMapSet();
1545   intptr_t start = (intptr_t)__ pc();
1546 
  // We have received a description of where all the java args are located
1548   // on entry to the wrapper. We need to convert these args to where
1549   // the jni function will expect them. To figure out where they go
1550   // we convert the java signature to a C signature by inserting
1551   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1552 
1553   const int total_in_args = method->size_of_parameters();
1554   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1555 
1556   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1557   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1558   BasicType* in_elem_bt = NULL;
1559 
1560   int argc = 0;
1561   out_sig_bt[argc++] = T_ADDRESS;
1562   if (method->is_static()) {
1563     out_sig_bt[argc++] = T_OBJECT;
1564   }
1565 
1566   for (int i = 0; i < total_in_args ; i++ ) {
1567     out_sig_bt[argc++] = in_sig_bt[i];
1568   }
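  // For illustration (a hypothetical signature, not taken from any caller):
  // for a static native int foo(long l, Object o) the Java signature is
  // { T_LONG, T_VOID, T_OBJECT } (longs carry a trailing T_VOID half), so
  // total_c_args is 3 + 2 == 5 and out_sig_bt becomes
  //   { T_ADDRESS (JNIEnv*), T_OBJECT (jclass), T_LONG, T_VOID, T_OBJECT }
  // matching the C call foo(JNIEnv*, jclass, jlong, jobject).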
1569 
1570   // Now figure out where the args must be stored and how much stack space
1571   // they require.
1572   int out_arg_slots;
1573   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1574 
1575   // Compute framesize for the wrapper.  We need to handlize all oops in
1576   // incoming registers
1577 
1578   // Calculate the total number of stack slots we will need.
1579 
1580   // First count the abi requirement plus all of the outgoing args
1581   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1582 
1583   // Now the space for the inbound oop handle area
1584   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1585 
1586   int oop_handle_offset = stack_slots;
1587   stack_slots += total_save_slots;
1588 
1589   // Now any space we need for handlizing a klass if static method
1590 
1591   int klass_slot_offset = 0;
1592   int klass_offset = -1;
1593   int lock_slot_offset = 0;
1594   bool is_static = false;
1595 
1596   if (method->is_static()) {
1597     klass_slot_offset = stack_slots;
1598     stack_slots += VMRegImpl::slots_per_word;
1599     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1600     is_static = true;
1601   }
1602 
1603   // Plus a lock if needed
1604 
1605   if (method->is_synchronized()) {
1606     lock_slot_offset = stack_slots;
1607     stack_slots += VMRegImpl::slots_per_word;
1608   }
1609 
  // Now a place (+2 slots) to save return values or temps during shuffling,
  // plus 4 slots for the return address (which we own) and the saved rbp
1612   stack_slots += 6;
1613 
1614   // Ok The space we have allocated will look like:
1615   //
1616   //
1617   // FP-> |                     |
1618   //      |---------------------|
1619   //      | 2 slots for moves   |
1620   //      |---------------------|
1621   //      | lock box (if sync)  |
1622   //      |---------------------| <- lock_slot_offset
1623   //      | klass (if static)   |
1624   //      |---------------------| <- klass_slot_offset
1625   //      | oopHandle area      |
1626   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1627   //      | outbound memory     |
1628   //      | based arguments     |
1629   //      |                     |
1630   //      |---------------------|
1631   //      |                     |
1632   // SP-> | out_preserved_slots |
1633   //
1634   //
1635 
1636 
  // Now compute the actual number of stack slots we need, rounding up to keep
  // the stack properly aligned.
1639   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1640 
1641   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
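  // Worked example (comment only; assumes the usual 64-bit values
  // stack_slot_size == 4, slots_per_word == 2, StackAlignmentInSlots == 4):
  // for a static synchronized native method whose C arguments all fit in
  // registers,
  //   out_preserve (0) + out_arg_slots (0)    ->  0
  //   + oop handle area (6 * 2)               -> 12
  //   + klass slot (2)                        -> 14
  //   + lock slot (2)                         -> 16
  //   + moves / return address / rbp (6)      -> 22
  // which aligns up to 24 slots, i.e. stack_size == 96 bytes.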
1642 
1643   // First thing make an ic check to see if we should even be here
1644 
1645   // We are free to use all registers as temps without saving them and
1646   // restoring them except rbp. rbp is the only callee save register
1647   // as far as the interpreter and the compiler(s) are concerned.
1648 
1649 
1650   const Register ic_reg = rax;
1651   const Register receiver = j_rarg0;
1652 
1653   Label hit;
1654   Label exception_pending;
1655 
1656   assert_different_registers(ic_reg, receiver, rscratch1);
1657   __ verify_oop(receiver);
1658   __ load_klass(rscratch1, receiver, rscratch2);
1659   __ cmpq(ic_reg, rscratch1);
1660   __ jcc(Assembler::equal, hit);
1661 
1662   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1663 
1664   // Verified entry point must be aligned
1665   __ align(8);
1666 
1667   __ bind(hit);
1668 
1669   int vep_offset = ((intptr_t)__ pc()) - start;
1670 
1671   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1672     Label L_skip_barrier;
1673     Register klass = r10;
1674     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1675     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1676 
1677     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1678 
1679     __ bind(L_skip_barrier);
1680   }
1681 
1682 #ifdef COMPILER1
1683   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1684   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1685     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1686   }
1687 #endif // COMPILER1
1688 
  // The instruction at the verified entry point must be 5 bytes or longer
  // because it can be patched on the fly by make_not_entrant. The stack bang
  // instruction fits that requirement.
1692 
1693   // Generate stack overflow check
1694   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1695 
1696   // Generate a new frame for the wrapper.
1697   __ enter();
1698   // -2 because return address is already present and so is saved rbp
1699   __ subptr(rsp, stack_size - 2*wordSize);
1700 
1701   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1702   bs->nmethod_entry_barrier(masm);
1703 
1704   // Frame is now completed as far as size and linkage.
1705   int frame_complete = ((intptr_t)__ pc()) - start;
1706 
  if (UseRTMLocking) {
    // Abort RTM transaction before calling JNI
    // because the critical section will be large and will be
    // aborted anyway.  Also the nmethod could be deoptimized.
    __ xabort(0);
  }

#ifdef ASSERT
  {
    Label L;
    __ mov(rax, rsp);
    __ andptr(rax, -16); // must be a 16-byte boundary (see amd64 ABI)
    __ cmpptr(rax, rsp);
    __ jcc(Assembler::equal, L);
    __ stop("improperly aligned stack");
    __ bind(L);
  }
#endif /* ASSERT */
1725 
1726 
1727   // We use r14 as the oop handle for the receiver/klass
1728   // It is callee save so it survives the call to native
1729 
1730   const Register oop_handle_reg = r14;
1731 
1732   //
1733   // We immediately shuffle the arguments so that any vm call we have to
1734   // make from here on out (sync slow path, jvmti, etc.) we will have
1735   // captured the oops from our caller and have a valid oopMap for
1736   // them.
1737 
1738   // -----------------
1739   // The Grand Shuffle
1740 
  // The Java calling convention is either equal to (linux) or denser than
  // (win64) the C calling convention. However, because of the jni_env argument,
  // the C calling convention always has at least one more argument (and two for
  // static methods) than Java. Therefore if we move the args from java -> c
  // backwards then we will never have a register->register conflict and we
  // don't have to build a dependency graph and figure out how to break any
  // cycles.
  //
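  // For illustration (assuming the usual Linux assignment j_rarg0..j_rarg2 ==
  // rsi, rdx, rcx and c_rarg2..c_rarg4 == rdx, rcx, r8): a static method with
  // three int args needs the moves
  //     arg0: rsi -> rdx,   arg1: rdx -> rcx,   arg2: rcx -> r8
  // Emitting them backwards (arg2, arg1, arg0) reads every source before its
  // register is overwritten; emitting them forwards would clobber rdx while
  // it still holds arg1.
  //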
1748 
  // Record rsp-based slot for receiver on stack for non-static methods
1750   int receiver_offset = -1;
1751 
1752   // This is a trick. We double the stack slots so we can claim
1753   // the oops in the caller's frame. Since we are sure to have
1754   // more args than the caller doubling is enough to make
1755   // sure we can capture all the incoming oop args from the
1756   // caller.
1757   //
1758   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1759 
1760   // Mark location of rbp (someday)
1761   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1762 
  // Use rax, rbx as temporaries during any memory-memory moves we have to do.
  // All inbound args are referenced relative to rbp and all outbound args via rsp.
1765 
1766 
1767 #ifdef ASSERT
1768   bool reg_destroyed[RegisterImpl::number_of_registers];
1769   bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1770   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1771     reg_destroyed[r] = false;
1772   }
1773   for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1774     freg_destroyed[f] = false;
1775   }
1776 
1777 #endif /* ASSERT */
1778 
1779   // For JNI natives the incoming and outgoing registers are offset upwards.
1780   GrowableArray<int> arg_order(2 * total_in_args);
1781 
1782   VMRegPair tmp_vmreg;
1783   tmp_vmreg.set2(rbx->as_VMReg());
1784 
1785   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1786     arg_order.push(i);
1787     arg_order.push(c_arg);
1788   }
1789 
1790   int temploc = -1;
1791   for (int ai = 0; ai < arg_order.length(); ai += 2) {
1792     int i = arg_order.at(ai);
1793     int c_arg = arg_order.at(ai + 1);
1794     __ block_comment(err_msg("move %d -> %d", i, c_arg));
1795 #ifdef ASSERT
1796     if (in_regs[i].first()->is_Register()) {
1797       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1798     } else if (in_regs[i].first()->is_XMMRegister()) {
1799       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1800     }
1801     if (out_regs[c_arg].first()->is_Register()) {
1802       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1803     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1804       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1805     }
1806 #endif /* ASSERT */
1807     switch (in_sig_bt[i]) {
1808       case T_ARRAY:
1809       case T_OBJECT:
1810         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1811                     ((i == 0) && (!is_static)),
1812                     &receiver_offset);
1813         break;
1814       case T_VOID:
1815         break;
1816 
      case T_FLOAT:
        __ float_move(in_regs[i], out_regs[c_arg]);
        break;
1820 
1821       case T_DOUBLE:
1822         assert( i + 1 < total_in_args &&
1823                 in_sig_bt[i + 1] == T_VOID &&
1824                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1825         __ double_move(in_regs[i], out_regs[c_arg]);
1826         break;
1827 
1828       case T_LONG :
1829         __ long_move(in_regs[i], out_regs[c_arg]);
1830         break;
1831 
1832       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1833 
1834       default:
1835         __ move32_64(in_regs[i], out_regs[c_arg]);
1836     }
1837   }
1838 
1839   int c_arg;
1840 
1841   // Pre-load a static method's oop into r14.  Used both by locking code and
1842   // the normal JNI call code.
1843   // point c_arg at the first arg that is already loaded in case we
1844   // need to spill before we call out
1845   c_arg = total_c_args - total_in_args;
1846 
1847   if (method->is_static()) {
1848 
1849     //  load oop into a register
1850     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1851 
    // Now handlize the static class mirror; it's known to be not-null.
1853     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1854     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1855 
1856     // Now get the handle
1857     __ lea(oop_handle_reg, Address(rsp, klass_offset));
1858     // store the klass handle as second argument
1859     __ movptr(c_rarg1, oop_handle_reg);
1860     // and protect the arg if we must spill
1861     c_arg--;
1862   }
1863 
  // Change state to native (we save the return address in the thread, since it might not
  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
  // points into the right code segment. It does not have to be the correct return pc.
  // We use the same pc/oopMap repeatedly when we call out.
1868 
1869   intptr_t the_pc = (intptr_t) __ pc();
1870   oop_maps->add_gc_map(the_pc - start, map);
1871 
1872   __ set_last_Java_frame(rsp, noreg, (address)the_pc);
1873 
1874 
  // We have all of the arguments set up at this point. We must not touch any of the
  // argument registers from here on (if we had to save and restore them around a call
  // there would be no oop map entries covering the saved values).
1877 
1878   {
1879     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
1880     // protect the args we've loaded
1881     save_args(masm, total_c_args, c_arg, out_regs);
1882     __ mov_metadata(c_rarg1, method());
1883     __ call_VM_leaf(
1884       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
1885       r15_thread, c_rarg1);
1886     restore_args(masm, total_c_args, c_arg, out_regs);
1887   }
1888 
1889   // RedefineClasses() tracing support for obsolete method entry
1890   if (log_is_enabled(Trace, redefine, class, obsolete)) {
1891     // protect the args we've loaded
1892     save_args(masm, total_c_args, c_arg, out_regs);
1893     __ mov_metadata(c_rarg1, method());
1894     __ call_VM_leaf(
1895       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
1896       r15_thread, c_rarg1);
1897     restore_args(masm, total_c_args, c_arg, out_regs);
1898   }
1899 
1900   // Lock a synchronized method
1901 
1902   // Register definitions used by locking and unlocking
1903 
1904   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
1905   const Register obj_reg  = rbx;  // Will contain the oop
1906   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
1907   const Register old_hdr  = r13;  // value of old header at unlock time
1908 
1909   Label slow_path_lock;
1910   Label lock_done;
1911 
1912   if (method->is_synchronized()) {
1913 
1914     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
1915 
1916     // Get the handle (the 2nd argument)
1917     __ mov(oop_handle_reg, c_rarg1);
1918 
1919     // Get address of the box
1920 
1921     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
1922 
1923     // Load the oop from the handle
1924     __ movptr(obj_reg, Address(oop_handle_reg, 0));
1925 
1926     // Load immediate 1 into swap_reg %rax
1927     __ movl(swap_reg, 1);
1928 
1929     // Load (object->mark() | 1) into swap_reg %rax
1930     __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1931 
1932     // Save (object->mark() | 1) into BasicLock's displaced header
1933     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
1934 
1935     // src -> dest iff dest == rax else rax <- dest
1936     __ lock();
1937     __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1938     __ jcc(Assembler::equal, lock_done);
1939 
1940     // Hmm should this move to the slow path code area???
1941 
1942     // Test if the oopMark is an obvious stack pointer, i.e.,
1943     //  1) (mark & 3) == 0, and
    //  2) rsp <= mark < rsp + os::pagesize()
1945     // These 3 tests can be done by evaluating the following
1946     // expression: ((mark - rsp) & (3 - os::vm_page_size())),
1947     // assuming both stack pointer and pagesize have their
1948     // least significant 2 bits clear.
1949     // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
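    // Worked example (comment only; assumes a 4K page): 3 - 4096 == -4093,
    // i.e. 0x...fffff003, so the andptr below keeps bits 0-1 and bits 12 and
    // up of (mark - rsp).  The result is zero exactly when the mark has its
    // low two bits clear and 0 <= mark - rsp < 4096, i.e. the mark points
    // into our own stack page and the lock is a recursive stack lock.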
1950 
1951     __ subptr(swap_reg, rsp);
1952     __ andptr(swap_reg, 3 - os::vm_page_size());
1953 
    // Save the test result; for the recursive case, the result is zero
1955     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
1956     __ jcc(Assembler::notEqual, slow_path_lock);
1957 
1958     // Slow path will re-enter here
1959 
1960     __ bind(lock_done);
1961   }
1962 
1963   // Finally just about ready to make the JNI call
1964 
1965   // get JNIEnv* which is first argument to native
1966   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
1967 
1968   // Now set thread in native
1969   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
1970 
1971   __ call(RuntimeAddress(native_func));
1972 
1973   // Verify or restore cpu control state after JNI call
1974   __ restore_cpu_control_state_after_jni();
1975 
1976   // Unpack native results.
1977   switch (ret_type) {
1978   case T_BOOLEAN: __ c2bool(rax);            break;
  case T_CHAR   : __ movzwl(rax, rax);       break;
1980   case T_BYTE   : __ sign_extend_byte (rax); break;
1981   case T_SHORT  : __ sign_extend_short(rax); break;
1982   case T_INT    : /* nothing to do */        break;
1983   case T_DOUBLE :
1984   case T_FLOAT  :
    // Result is in xmm0; we'll save it as needed
1986     break;
1987   case T_ARRAY:                 // Really a handle
1988   case T_OBJECT:                // Really a handle
1989       break; // can't de-handlize until after safepoint check
1990   case T_VOID: break;
1991   case T_LONG: break;
1992   default       : ShouldNotReachHere();
1993   }
1994 
1995   Label after_transition;
1996 
1997   // Switch thread to "native transition" state before reading the synchronization state.
1998   // This additional state is necessary because reading and testing the synchronization
1999   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2000   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2001   //     VM thread changes sync state to synchronizing and suspends threads for GC.
  //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
2004   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2005 
2006   // Force this write out before the read below
2007   __ membar(Assembler::Membar_mask_bits(
2008               Assembler::LoadLoad | Assembler::LoadStore |
2009               Assembler::StoreLoad | Assembler::StoreStore));
2010 
2011   // check for safepoint operation in progress and/or pending suspend requests
2012   {
2013     Label Continue;
2014     Label slow_path;
2015 
2016     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2017 
2018     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2019     __ jcc(Assembler::equal, Continue);
2020     __ bind(slow_path);
2021 
    // Don't use call_VM as it will see a possible pending exception and forward it
    // and never return here, preventing us from clearing _last_native_pc down below.
    // We also can't use call_VM_leaf, as it will check to see if rsi & rdi are
    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
    // by hand.
2027     //
2028     __ vzeroupper();
2029     save_native_result(masm, ret_type, stack_slots);
2030     __ mov(c_rarg0, r15_thread);
2031     __ mov(r12, rsp); // remember sp
2032     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2033     __ andptr(rsp, -16); // align stack as required by ABI
2034     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2035     __ mov(rsp, r12); // restore sp
2036     __ reinit_heapbase();
2037     // Restore any method result value
2038     restore_native_result(masm, ret_type, stack_slots);
2039     __ bind(Continue);
2040   }
2041 
2042   // change thread state
2043   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2044   __ bind(after_transition);
2045 
2046   Label reguard;
2047   Label reguard_done;
2048   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2049   __ jcc(Assembler::equal, reguard);
2050   __ bind(reguard_done);
2051 
  // The native result, if any, is live now
2053 
2054   // Unlock
2055   Label unlock_done;
2056   Label slow_path_unlock;
2057   if (method->is_synchronized()) {
2058 
2059     // Get locked oop from the handle we passed to jni
2060     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2061 
2062     Label done;
2063     // Simple recursive lock?
2064 
2065     __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2066     __ jcc(Assembler::equal, done);
2067 
    // Must save rax if it is live now because cmpxchg must use it
2069     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2070       save_native_result(masm, ret_type, stack_slots);
2071     }
2072 
2073 
2074     // get address of the stack lock
2075     __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2076     //  get old displaced header
2077     __ movptr(old_hdr, Address(rax, 0));
2078 
2079     // Atomic swap old header if oop still contains the stack lock
2080     __ lock();
2081     __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2082     __ jcc(Assembler::notEqual, slow_path_unlock);
2083 
2084     // slow path re-enters here
2085     __ bind(unlock_done);
2086     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2087       restore_native_result(masm, ret_type, stack_slots);
2088     }
2089 
2090     __ bind(done);
2091 
2092   }
2093   {
2094     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2095     save_native_result(masm, ret_type, stack_slots);
2096     __ mov_metadata(c_rarg1, method());
2097     __ call_VM_leaf(
2098          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2099          r15_thread, c_rarg1);
2100     restore_native_result(masm, ret_type, stack_slots);
2101   }
2102 
2103   __ reset_last_Java_frame(false);
2104 
2105   // Unbox oop result, e.g. JNIHandles::resolve value.
2106   if (is_reference_type(ret_type)) {
2107     __ resolve_jobject(rax /* value */,
2108                        r15_thread /* thread */,
2109                        rcx /* tmp */);
2110   }
2111 
2112   if (CheckJNICalls) {
2113     // clear_pending_jni_exception_check
2114     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2115   }
2116 
2117   // reset handle block
2118   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2119   __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2120 
2121   // pop our frame
2122 
2123   __ leave();
2124 
2125   // Any exception pending?
2126   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2127   __ jcc(Assembler::notEqual, exception_pending);
2128 
2129   // Return
2130 
2131   __ ret(0);
2132 
2133   // Unexpected paths are out of line and go here
2134 
2135   // forward the exception
2136   __ bind(exception_pending);
2137 
2138   // and forward the exception
2139   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2140 
2141   // Slow path locking & unlocking
2142   if (method->is_synchronized()) {
2143 
2144     // BEGIN Slow path lock
2145     __ bind(slow_path_lock);
2146 
    // We have last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM.
2148     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2149 
2150     // protect the args we've loaded
2151     save_args(masm, total_c_args, c_arg, out_regs);
2152 
2153     __ mov(c_rarg0, obj_reg);
2154     __ mov(c_rarg1, lock_reg);
2155     __ mov(c_rarg2, r15_thread);
2156 
2157     // Not a leaf but we have last_Java_frame setup as we want
2158     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2159     restore_args(masm, total_c_args, c_arg, out_regs);
2160 
2161 #ifdef ASSERT
    {
      Label L;
      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("no pending exception allowed on exit from monitorenter");
      __ bind(L);
    }
2168 #endif
2169     __ jmp(lock_done);
2170 
2171     // END Slow path lock
2172 
2173     // BEGIN Slow path unlock
2174     __ bind(slow_path_unlock);
2175 
2176     // If we haven't already saved the native result we must save it now as xmm registers
2177     // are still exposed.
2178     __ vzeroupper();
2179     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2180       save_native_result(masm, ret_type, stack_slots);
2181     }
2182 
2183     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2184 
2185     __ mov(c_rarg0, obj_reg);
2186     __ mov(c_rarg2, r15_thread);
2187     __ mov(r12, rsp); // remember sp
2188     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2189     __ andptr(rsp, -16); // align stack as required by ABI
2190 
2191     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2192     // NOTE that obj_reg == rbx currently
2193     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2194     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2195 
2196     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2197     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2198     __ mov(rsp, r12); // restore sp
2199     __ reinit_heapbase();
2200 #ifdef ASSERT
2201     {
2202       Label L;
      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("no pending exception allowed on exit from complete_monitor_unlocking_C");
2206       __ bind(L);
2207     }
2208 #endif /* ASSERT */
2209 
2210     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2211 
2212     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2213       restore_native_result(masm, ret_type, stack_slots);
2214     }
2215     __ jmp(unlock_done);
2216 
2217     // END Slow path unlock
2218 
2219   } // synchronized
2220 
2221   // SLOW PATH Reguard the stack if needed
2222 
2223   __ bind(reguard);
2224   __ vzeroupper();
2225   save_native_result(masm, ret_type, stack_slots);
2226   __ mov(r12, rsp); // remember sp
2227   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2228   __ andptr(rsp, -16); // align stack as required by ABI
2229   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2230   __ mov(rsp, r12); // restore sp
2231   __ reinit_heapbase();
2232   restore_native_result(masm, ret_type, stack_slots);
2233   // and continue
2234   __ jmp(reguard_done);
2235 
2236 
2237 
2238   __ flush();
2239 
2240   nmethod *nm = nmethod::new_native_nmethod(method,
2241                                             compile_id,
2242                                             masm->code(),
2243                                             vep_offset,
2244                                             frame_complete,
2245                                             stack_slots / VMRegImpl::slots_per_word,
2246                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2247                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2248                                             oop_maps);
2249 
2250   return nm;
2251 }
2252 
// This function returns the adjustment size (in number of words) to a c2i adapter
// activation for use during deoptimization.
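// For example (assuming Interpreter::stackElementWords == 1 on a 64-bit VM),
// a callee with 2 parameters and 5 locals needs an adjustment of 3 words.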
2255 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2256   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2257 }
2258 
2259 
2260 uint SharedRuntime::out_preserve_stack_slots() {
2261   return 0;
2262 }
2263 
2264 
2265 // Number of stack slots between incoming argument block and the start of
2266 // a new frame.  The PROLOG must add this many slots to the stack.  The
2267 // EPILOG must remove this many slots.  amd64 needs two slots for
2268 // return address.
2269 uint SharedRuntime::in_preserve_stack_slots() {
2270   return 4 + 2 * VerifyStackAtCalls;
2271 }
2272 
2273 //------------------------------generate_deopt_blob----------------------------
2274 void SharedRuntime::generate_deopt_blob() {
2275   // Allocate space for the code
2276   ResourceMark rm;
2277   // Setup code generation tools
2278   int pad = 0;
2279   if (UseAVX > 2) {
2280     pad += 1024;
2281   }
2282 #if INCLUDE_JVMCI
2283   if (EnableJVMCI) {
2284     pad += 512; // Increase the buffer size when compiling for JVMCI
2285   }
2286 #endif
2287   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2288   MacroAssembler* masm = new MacroAssembler(&buffer);
2289   int frame_size_in_words;
2290   OopMap* map = NULL;
2291   OopMapSet *oop_maps = new OopMapSet();
2292 
2293   // -------------
2294   // This code enters when returning to a de-optimized nmethod.  A return
  // address has been pushed on the stack, and return values are in
2296   // registers.
2297   // If we are doing a normal deopt then we were called from the patched
2298   // nmethod from the point we returned to the nmethod. So the return
2299   // address on the stack is wrong by NativeCall::instruction_size
2300   // We will adjust the value so it looks like we have the original return
2301   // address on the stack (like when we eagerly deoptimized).
2302   // In the case of an exception pending when deoptimizing, we enter
2303   // with a return address on the stack that points after the call we patched
2304   // into the exception handler. We have the following register state from,
2305   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2306   //    rax: exception oop
2307   //    rbx: exception handler
2308   //    rdx: throwing pc
2309   // So in this case we simply jam rdx into the useless return address and
2310   // the stack looks just like we want.
2311   //
2312   // At this point we need to de-opt.  We save the argument return
2313   // registers.  We call the first C routine, fetch_unroll_info().  This
2314   // routine captures the return values and returns a structure which
2315   // describes the current frame size and the sizes of all replacement frames.
2316   // The current frame is compiled code and may contain many inlined
2317   // functions, each with their own JVM state.  We pop the current frame, then
2318   // push all the new frames.  Then we call the C routine unpack_frames() to
2319   // populate these frames.  Finally unpack_frames() returns us the new target
2320   // address.  Notice that callee-save registers are BLOWN here; they have
2321   // already been captured in the vframeArray at the time the return PC was
2322   // patched.
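  //
  // As a quick map of the entry points below (summarizing the code that
  // follows): each prolog records an exec mode in callee-saved r14 for the
  // later unpack_frames() call:
  //   normal deopt                    -> Deoptimization::Unpack_deopt
  //   re-execute                      -> Deoptimization::Unpack_reexecute
  //   deopt with a pending exception  -> Deoptimization::Unpack_exception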
2323   address start = __ pc();
2324   Label cont;
2325 
2326   // Prolog for non exception case!
2327 
2328   // Save everything in sight.
2329   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2330 
2331   // Normal deoptimization.  Save exec mode for unpack_frames.
2332   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2333   __ jmp(cont);
2334 
2335   int reexecute_offset = __ pc() - start;
2336 #if INCLUDE_JVMCI && !defined(COMPILER1)
2337   if (EnableJVMCI && UseJVMCICompiler) {
2338     // JVMCI does not use this kind of deoptimization
2339     __ should_not_reach_here();
2340   }
2341 #endif
2342 
2343   // Reexecute case
  // The return address is the pc that describes what bci to re-execute at
2345 
2346   // No need to update map as each call to save_live_registers will produce identical oopmap
2347   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2348 
2349   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2350   __ jmp(cont);
2351 
2352 #if INCLUDE_JVMCI
2353   Label after_fetch_unroll_info_call;
2354   int implicit_exception_uncommon_trap_offset = 0;
2355   int uncommon_trap_offset = 0;
2356 
2357   if (EnableJVMCI) {
2358     implicit_exception_uncommon_trap_offset = __ pc() - start;
2359 
2360     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2361     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2362 
2363     uncommon_trap_offset = __ pc() - start;
2364 
2365     // Save everything in sight.
2366     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2367     // fetch_unroll_info needs to call last_java_frame()
2368     __ set_last_Java_frame(noreg, noreg, NULL);
2369 
2370     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2371     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2372 
2373     __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2374     __ mov(c_rarg0, r15_thread);
2375     __ movl(c_rarg2, r14); // exec mode
2376     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2377     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2378 
2379     __ reset_last_Java_frame(false);
2380 
2381     __ jmp(after_fetch_unroll_info_call);
2382   } // EnableJVMCI
2383 #endif // INCLUDE_JVMCI
2384 
2385   int exception_offset = __ pc() - start;
2386 
2387   // Prolog for exception case
2388 
  // All registers are dead at this entry point, except for rax and
  // rdx, which contain the exception oop and exception pc
  // respectively.  Set them in TLS and fall thru to the
  // unpack_with_exception_in_tls entry point.
2393 
2394   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2395   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2396 
2397   int exception_in_tls_offset = __ pc() - start;
2398 
2399   // new implementation because exception oop is now passed in JavaThread
2400 
2401   // Prolog for exception case
2402   // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2404   // tos: stack at point of call to method that threw the exception (i.e. only
2405   // args are on the stack, no return address)
2406 
2407   // make room on stack for the return address
2408   // It will be patched later with the throwing pc. The correct value is not
2409   // available now because loading it from memory would destroy registers.
2410   __ push(0);
2411 
2412   // Save everything in sight.
2413   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2414 
2415   // Now it is safe to overwrite any register
2416 
2417   // Deopt during an exception.  Save exec mode for unpack_frames.
2418   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2419 
2420   // load throwing pc from JavaThread and patch it as the return address
2421   // of the current frame. Then clear the field in JavaThread
2422 
2423   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2424   __ movptr(Address(rbp, wordSize), rdx);
2425   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2426 
2427 #ifdef ASSERT
2428   // verify that there is really an exception oop in JavaThread
2429   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2430   __ verify_oop(rax);
2431 
2432   // verify that there is no pending exception
2433   Label no_pending_exception;
2434   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2435   __ testptr(rax, rax);
2436   __ jcc(Assembler::zero, no_pending_exception);
2437   __ stop("must not have pending exception here");
2438   __ bind(no_pending_exception);
2439 #endif
2440 
2441   __ bind(cont);
2442 
2443   // Call C code.  Need thread and this frame, but NOT official VM entry
2444   // crud.  We cannot block on this call, no GC can happen.
2445   //
2446   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2447 
2448   // fetch_unroll_info needs to call last_java_frame().
2449 
2450   __ set_last_Java_frame(noreg, noreg, NULL);
2451 #ifdef ASSERT
2452   { Label L;
2453     __ cmpptr(Address(r15_thread,
2454                     JavaThread::last_Java_fp_offset()),
2455             (int32_t)0);
2456     __ jcc(Assembler::equal, L);
2457     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2458     __ bind(L);
2459   }
2460 #endif // ASSERT
2461   __ mov(c_rarg0, r15_thread);
2462   __ movl(c_rarg1, r14); // exec_mode
2463   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2464 
2465   // Need to have an oopmap that tells fetch_unroll_info where to
2466   // find any register it might need.
2467   oop_maps->add_gc_map(__ pc() - start, map);
2468 
2469   __ reset_last_Java_frame(false);
2470 
2471 #if INCLUDE_JVMCI
2472   if (EnableJVMCI) {
2473     __ bind(after_fetch_unroll_info_call);
2474   }
2475 #endif
2476 
2477   // Load UnrollBlock* into rdi
2478   __ mov(rdi, rax);
2479 
2480   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
  Label noException;
2482   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2483   __ jcc(Assembler::notEqual, noException);
2484   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2485   // QQQ this is useless it was NULL above
2486   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2487   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2488   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2489 
2490   __ verify_oop(rax);
2491 
2492   // Overwrite the result registers with the exception results.
2493   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2494   // I think this is useless
2495   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2496 
2497   __ bind(noException);
2498 
2499   // Only register save data is on the stack.
2500   // Now restore the result registers.  Everything else is either dead
2501   // or captured in the vframeArray.
2502   RegisterSaver::restore_result_registers(masm);
2503 
  // All of the register save area has been popped off the stack. Only the
  // return address remains.
2506 
2507   // Pop all the frames we must move/replace.
2508   //
2509   // Frame picture (youngest to oldest)
2510   // 1: self-frame (no frame link)
2511   // 2: deopting frame  (no frame link)
2512   // 3: caller of deopting frame (could be compiled/interpreted).
2513   //
2514   // Note: by leaving the return address of self-frame on the stack
2515   // and using the size of frame 2 to adjust the stack
2516   // when we are done the return to frame 3 will still be on the stack.
2517 
2518   // Pop deoptimized frame
2519   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2520   __ addptr(rsp, rcx);
2521 
2522   // rsp should be pointing at the return address to the caller (3)
2523 
2524   // Pick up the initial fp we should save
2525   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2526   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2527 
2528 #ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
2532   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2533   __ bang_stack_size(rbx, rcx);
2534 #endif
2535 
2536   // Load address of array of frame pcs into rcx
2537   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2538 
2539   // Trash the old pc
2540   __ addptr(rsp, wordSize);
2541 
2542   // Load address of array of frame sizes into rsi
2543   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2544 
2545   // Load counter into rdx
2546   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2547 
  // Now adjust the caller's stack to make up for the extra locals,
  // but record the original sp so that we can save it in the skeletal interpreter
  // frame; then the stack walking of interpreter_sender will get the unextended sp
  // value and not the "real" sp value.
2552 
2553   const Register sender_sp = r8;
2554 
2555   __ mov(sender_sp, rsp);
2556   __ movl(rbx, Address(rdi,
2557                        Deoptimization::UnrollBlock::
2558                        caller_adjustment_offset_in_bytes()));
2559   __ subptr(rsp, rbx);
2560 
2561   // Push interpreter frames in a loop
2562   Label loop;
2563   __ bind(loop);
2564   __ movptr(rbx, Address(rsi, 0));      // Load frame size
  __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
2566   __ pushptr(Address(rcx, 0));          // Save return address
  __ enter();                           // Save old & set new rbp
2568   __ subptr(rsp, rbx);                  // Prolog
2569   // This value is corrected by layout_activation_impl
2570   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2571   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2572   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2573   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2574   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2575   __ decrementl(rdx);                   // Decrement counter
2576   __ jcc(Assembler::notZero, loop);
2577   __ pushptr(Address(rcx, 0));          // Save final return address
2578 
2579   // Re-push self-frame
  __ enter();                           // Save old & set new rbp
2581 
2582   // Allocate a full sized register save area.
2583   // Return address and rbp are in place, so we allocate two less words.
2584   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2585 
2586   // Restore frame locals after moving the frame
2587   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2588   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2589 
2590   // Call C code.  Need thread but NOT official VM entry
2591   // crud.  We cannot block on this call, no GC can happen.  Call should
2592   // restore return values to their stack-slots with the new SP.
2593   //
2594   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2595 
2596   // Use rbp because the frames look interpreted now
2597   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2598   // Don't need the precise return PC here, just precise enough to point into this code blob.
2599   address the_pc = __ pc();
2600   __ set_last_Java_frame(noreg, rbp, the_pc);
2601 
2602   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2603   __ mov(c_rarg0, r15_thread);
2604   __ movl(c_rarg1, r14); // second arg: exec_mode
2605   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2606   // Revert SP alignment after call since we're going to do some SP relative addressing below
2607   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2608 
2609   // Set an oopmap for the call site
2610   // Use the same PC we used for the last java frame
2611   oop_maps->add_gc_map(the_pc - start,
2612                        new OopMap( frame_size_in_words, 0 ));
2613 
2614   // Clear fp AND pc
2615   __ reset_last_Java_frame(true);
2616 
2617   // Collect return values
2618   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2619   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2620   // I think this is useless (throwing pc?)
2621   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2622 
2623   // Pop self-frame.
2624   __ leave();                           // Epilog
2625 
2626   // Jump to interpreter
2627   __ ret(0);
2628 
2629   // Make sure all code is generated
2630   masm->flush();
2631 
2632   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2633   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2634 #if INCLUDE_JVMCI
2635   if (EnableJVMCI) {
2636     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2637     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2638   }
2639 #endif
2640 }
2641 
2642 #ifdef COMPILER2
2643 //------------------------------generate_uncommon_trap_blob--------------------
2644 void SharedRuntime::generate_uncommon_trap_blob() {
2645   // Allocate space for the code
2646   ResourceMark rm;
2647   // Setup code generation tools
2648   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2649   MacroAssembler* masm = new MacroAssembler(&buffer);
2650 
2651   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2652 
2653   address start = __ pc();
2654 
2655   if (UseRTMLocking) {
2656     // Abort RTM transaction before possible nmethod deoptimization.
2657     __ xabort(0);
2658   }
2659 
2660   // Push self-frame.  We get here with a return address on the
2661   // stack, so rsp is 8-byte aligned until we allocate our frame.
2662   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
2663 
2664   // No callee saved registers. rbp is assumed implicitly saved
2665   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2666 
2667   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2668   // runtime expects it.
2669   __ movl(c_rarg1, j_rarg0);
2670 
2671   __ set_last_Java_frame(noreg, noreg, NULL);
2672 
2673   // Call C code.  Need thread but NOT official VM entry
2674   // crud.  We cannot block on this call, no GC can happen.  Call should
2675   // capture callee-saved registers as well as return values.
2676   // Thread is passed explicitly in c_rarg0 below.
2677   //
2678   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
2679 
2680   __ mov(c_rarg0, r15_thread);
2681   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2682   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2683 
2684   // Set an oopmap for the call site
2685   OopMapSet* oop_maps = new OopMapSet();
2686   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2687 
2688   // location of rbp is known implicitly by the frame sender code
2689 
2690   oop_maps->add_gc_map(__ pc() - start, map);
2691 
2692   __ reset_last_Java_frame(false);
2693 
2694   // Load UnrollBlock* into rdi
2695   __ mov(rdi, rax);
2696 
2697 #ifdef ASSERT
2698   { Label L;
2699     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2700             (int32_t)Deoptimization::Unpack_uncommon_trap);
2701     __ jcc(Assembler::equal, L);
2702     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2703     __ bind(L);
2704   }
2705 #endif
2706 
2707   // Pop all the frames we must move/replace.
2708   //
2709   // Frame picture (youngest to oldest)
2710   // 1: self-frame (no frame link)
2711   // 2: deopting frame  (no frame link)
2712   // 3: caller of deopting frame (could be compiled/interpreted).
2713 
2714   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2715   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2716 
2717   // Pop deoptimized frame (int)
2718   __ movl(rcx, Address(rdi,
2719                        Deoptimization::UnrollBlock::
2720                        size_of_deoptimized_frame_offset_in_bytes()));
2721   __ addptr(rsp, rcx);
2722 
2723   // rsp should be pointing at the return address to the caller (3)
2724 
2725   // Pick up the initial fp we should save
2726   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2727   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2728 
2729 #ifdef ASSERT
2730   // Compilers generate code that bangs the stack by as much as the
2731   // interpreter would need. So this stack banging should never
2732   // trigger a fault. Verify that it does not on non-product builds.
2733   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2734   __ bang_stack_size(rbx, rcx);
2735 #endif
2736 
2737   // Load address of array of frame pcs into rcx (address*)
2738   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2739 
2740   // Trash the return pc
2741   __ addptr(rsp, wordSize);
2742 
2743   // Load address of array of frame sizes into rsi (intptr_t*)
2744   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2745 
2746   // Counter
2747   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
2748 
2749   // Now adjust the caller's stack to make up for the extra locals but
2750   // record the original sp so that we can save it in the skeletal
2751   // interpreter frame and the stack walking of interpreter_sender
2752   // will get the unextended sp value and not the "real" sp value.
2753 
2754   const Register sender_sp = r8;
2755 
2756   __ mov(sender_sp, rsp);
2757   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
2758   __ subptr(rsp, rbx);
2759 
2760   // Push interpreter frames in a loop
2761   Label loop;
2762   __ bind(loop);
2763   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2764   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2765   __ pushptr(Address(rcx, 0));     // Save return address
2766   __ enter();                      // Save old & set new rbp
2767   __ subptr(rsp, rbx);             // Prolog
2768   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2769             sender_sp);            // Make it walkable
2770   // This value is corrected by layout_activation_impl
2771   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2772   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
2773   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
2774   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
2775   __ decrementl(rdx);              // Decrement counter
2776   __ jcc(Assembler::notZero, loop);
2777   __ pushptr(Address(rcx, 0));     // Save final return address
2778 
2779   // Re-push self-frame
2780   __ enter();                 // Save old & set new rbp
2781   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2782                               // Prolog
2783 
2784   // Use rbp because the frames look interpreted now
2785   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2786   // Don't need the precise return PC here, just precise enough to point into this code blob.
2787   address the_pc = __ pc();
2788   __ set_last_Java_frame(noreg, rbp, the_pc);
2789 
2790   // Call C code.  Need thread but NOT official VM entry
2791   // crud.  We cannot block on this call, no GC can happen.  Call should
2792   // restore return values to their stack-slots with the new SP.
2793   // Thread is passed explicitly in c_rarg0 below.
2794   //
2795   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2796 
2797   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2798   __ mov(c_rarg0, r15_thread);
2799   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2800   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2801 
2802   // Set an oopmap for the call site
2803   // Use the same PC we used for the last java frame
2804   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2805 
2806   // Clear fp AND pc
2807   __ reset_last_Java_frame(true);
2808 
2809   // Pop self-frame.
2810   __ leave();                 // Epilog
2811 
2812   // Jump to interpreter
2813   __ ret(0);
2814 
2815   // Make sure all code is generated
2816   masm->flush();
2817 
2818   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
2819                                                  SimpleRuntimeFrame::framesize >> 1);
2820 }
2821 #endif // COMPILER2
2822 
2823 //------------------------------generate_handler_blob------
2824 //
2825 // Generate a special Compile2Runtime blob that saves all registers
2826 // and sets up an oopmap.
2827 //
2828 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
2829   assert(StubRoutines::forward_exception_entry() != NULL,
2830          "must be generated before");
2831 
2832   ResourceMark rm;
2833   OopMapSet *oop_maps = new OopMapSet();
2834   OopMap* map;
2835 
2836   // Allocate space for the code.  Setup code generation tools.
2837   CodeBuffer buffer("handler_blob", 2048, 1024);
2838   MacroAssembler* masm = new MacroAssembler(&buffer);
2839 
2840   address start   = __ pc();
2841   address call_pc = NULL;
2842   int frame_size_in_words;
2843   bool cause_return = (poll_type == POLL_AT_RETURN);
2844   bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
2845 
2846   if (UseRTMLocking) {
2847     // Abort RTM transaction before calling runtime
2848     // because critical section will be large and will be
2849     // aborted anyway. Also nmethod could be deoptimized.
2850     __ xabort(0);
2851   }
2852 
2853   // Make room for return address (or push it again)
2854   if (!cause_return) {
2855     __ push(rbx);
2856   }
2857 
2858   // Save registers, fpu state, and flags
2859   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
2860 
2861   // The following is basically a call_VM.  However, we need the precise
2862   // address of the call in order to generate an oopmap. Hence, we do all the
2863   // work ourselves.
2864 
2865   __ set_last_Java_frame(noreg, noreg, NULL);
2866 
2867   // The return address must always be correct so that the frame constructor never
2868   // sees an invalid pc.
2869 
2870   if (!cause_return) {
2871     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
2872     // Additionally, rbx is a callee saved register and we can look at it later to determine
2873     // if someone changed the return address for us!
2874     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
2875     __ movptr(Address(rbp, wordSize), rbx);
2876   }
2877 
2878   // Do the call
2879   __ mov(c_rarg0, r15_thread);
2880   __ call(RuntimeAddress(call_ptr));
2881 
2882   // Set an oopmap for the call site.  This oopmap will map all
2883   // oop-registers and debug-info registers as callee-saved.  This
2884   // will allow deoptimization at this safepoint to find all possible
2885   // debug-info recordings, as well as let GC find all oops.
2886 
2887   oop_maps->add_gc_map( __ pc() - start, map);
2888 
2889   Label noException;
2890 
2891   __ reset_last_Java_frame(false);
2892 
2893   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
2894   __ jcc(Assembler::equal, noException);
2895 
2896   // Exception pending
2897 
2898   RegisterSaver::restore_live_registers(masm, save_vectors);
2899 
2900   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2901 
2902   // No exception case
2903   __ bind(noException);
2904 
2905   Label no_adjust;
2906 #ifdef ASSERT
2907   Label bail;
2908 #endif
2909   if (!cause_return) {
2910     Label no_prefix, not_special;
2911 
2912     // If our stashed return pc was modified by the runtime we avoid touching it
2913     __ cmpptr(rbx, Address(rbp, wordSize));
2914     __ jccb(Assembler::notEqual, no_adjust);
2915 
2916     // Skip over the poll instruction.
2917     // See NativeInstruction::is_safepoint_poll()
2918     // Possible encodings:
2919     //      85 00       test   %eax,(%rax)
2920     //      85 01       test   %eax,(%rcx)
2921     //      85 02       test   %eax,(%rdx)
2922     //      85 03       test   %eax,(%rbx)
2923     //      85 06       test   %eax,(%rsi)
2924     //      85 07       test   %eax,(%rdi)
2925     //
2926     //   41 85 00       test   %eax,(%r8)
2927     //   41 85 01       test   %eax,(%r9)
2928     //   41 85 02       test   %eax,(%r10)
2929     //   41 85 03       test   %eax,(%r11)
2930     //   41 85 06       test   %eax,(%r14)
2931     //   41 85 07       test   %eax,(%r15)
2932     //
2933     //      85 04 24    test   %eax,(%rsp)
2934     //   41 85 04 24    test   %eax,(%r12)
2935     //      85 45 00    test   %eax,0x0(%rbp)
2936     //   41 85 45 00    test   %eax,0x0(%r13)
2937 
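    // Decoding sketch (mirrors the checks below): starting at the stashed
    // return pc, skip an optional 0x41 REX.B prefix; then, if the modrm byte's
    // low three bits select rsp/rbp (or r12/r13 with the prefix), skip one more
    // byte for the SIB or disp8; finally skip the two bytes of the 0x85 test
    // itself.  The stashed pc is advanced by 2, 3, or 4 bytes in total.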
2938     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
2939     __ jcc(Assembler::notEqual, no_prefix);
2940     __ addptr(rbx, 1);
2941     __ bind(no_prefix);
2942 #ifdef ASSERT
2943     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
2944 #endif
2945     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
2946     // r12/rsp 0x04
2947     // r13/rbp 0x05
2948     __ movzbq(rcx, Address(rbx, 1));
2949     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
2950     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
2951     __ cmpptr(rcx, 1);
2952     __ jcc(Assembler::above, not_special);
2953     __ addptr(rbx, 1);
2954     __ bind(not_special);
2955 #ifdef ASSERT
2956     // Verify the correct encoding of the poll we're about to skip.
2957     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
2958     __ jcc(Assembler::notEqual, bail);
2959     // Mask out the modrm bits
2960     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
2961     // rax encodes to 0, so if the bits are nonzero it's incorrect
2962     __ jcc(Assembler::notZero, bail);
2963 #endif
2964     // Adjust return pc forward to step over the safepoint poll instruction
2965     __ addptr(rbx, 2);
2966     __ movptr(Address(rbp, wordSize), rbx);
2967   }
2968 
2969   __ bind(no_adjust);
2970   // Normal exit, restore registers and exit.
2971   RegisterSaver::restore_live_registers(masm, save_vectors);
2972   __ ret(0);
2973 
2974 #ifdef ASSERT
2975   __ bind(bail);
2976   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
2977 #endif
2978 
2979   // Make sure all code is generated
2980   masm->flush();
2981 
2982   // Fill-out other meta info
2983   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
2984 }
2985 
2986 //
2987 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
2988 //
2989 // Generate a stub that calls into the VM to find out the proper destination
2990 // of a Java call. All the argument registers are live at this point,
2991 // but since this is generic code we don't know what they are and the caller
2992 // must do any GC of the args.
2993 //
2994 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
2995   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
2996 
2997   // allocate space for the code
2998   ResourceMark rm;
2999 
3000   CodeBuffer buffer(name, 1000, 512);
3001   MacroAssembler* masm                = new MacroAssembler(&buffer);
3002 
3003   int frame_size_in_words;
3004 
3005   OopMapSet *oop_maps = new OopMapSet();
3006   OopMap* map = NULL;
3007 
3008   int start = __ offset();
3009 
3010   // No need to save vector registers since they are caller-saved anyway.
3011   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
3012 
3013   int frame_complete = __ offset();
3014 
3015   __ set_last_Java_frame(noreg, noreg, NULL);
3016 
3017   __ mov(c_rarg0, r15_thread);
3018 
3019   __ call(RuntimeAddress(destination));
3020 
3021 
3022   // Set an oopmap for the call site.
3023   // We need this not only for callee-saved registers, but also for volatile
3024   // registers that the compiler might be keeping live across a safepoint.
3025 
3026   oop_maps->add_gc_map( __ offset() - start, map);
3027 
3028   // rax contains the address we are going to jump to, assuming no exception was installed
3029 
3030   // clear last_Java_sp
3031   __ reset_last_Java_frame(false);
3032   // check for pending exceptions
3033   Label pending;
3034   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3035   __ jcc(Assembler::notEqual, pending);
3036 
3037   // get the returned Method*
3038   __ get_vm_result_2(rbx, r15_thread);
3039   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3040 
3041   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
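  // Stashing the Method* and the destination into the register save area means
  // restore_live_registers() below reloads them into rbx and rax along with
  // everything else, so we can simply jump through rax with all argument
  // registers intact.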
3042 
3043   RegisterSaver::restore_live_registers(masm);
3044 
3045   // We are back to the original state on entry and ready to go.
3046 
3047   __ jmp(rax);
3048 
3049   // Pending exception after the safepoint
3050 
3051   __ bind(pending);
3052 
3053   RegisterSaver::restore_live_registers(masm);
3054 
3055   // exception pending => remove activation and forward to exception handler
3056 
3057   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3058 
3059   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3060   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3061 
3062   // -------------
3063   // make sure all code is generated
3064   masm->flush();
3065 
3066   // Return the blob.  RuntimeStub::new_runtime_stub() takes the frame size
3067   // in words, which is what frame_size_in_words holds.
3068   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3069 }
3070 
3071 //------------------------------Montgomery multiplication------------------------
3072 //
3073 
3074 #ifndef _WINDOWS
3075 
3076 // Subtract 0:b from carry:a.  Return carry.
3077 static julong
3078 sub(julong a[], julong b[], julong carry, long len) {
3079   long long i = 0, cnt = len;
3080   julong tmp;
3081   asm volatile("clc; "
3082                "0: ; "
3083                "mov (%[b], %[i], 8), %[tmp]; "
3084                "sbb %[tmp], (%[a], %[i], 8); "
3085                "inc %[i]; dec %[cnt]; "
3086                "jne 0b; "
3087                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3088                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3089                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3090                : "memory");
3091   return tmp;
3092 }
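// A note on the inline asm above: the clc/sbb loop subtracts b from a in
// place, 64 bits at a time, letting the carry flag chain the borrows; the
// final sbb folds the borrow left over from the loop into the incoming carry
// word, which is returned as the new carry.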
3093 
3094 // Multiply (unsigned) Long A by Long B, accumulating the double-
3095 // length result into the accumulator formed of T0, T1, and T2.
3096 #define MACC(A, B, T0, T1, T2)                                  \
3097 do {                                                            \
3098   unsigned long hi, lo;                                         \
3099   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3100            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3101            : "r"(A), "a"(B) : "cc");                            \
3102  } while(0)
3103 
3104 // As above, but add twice the double-length result into the
3105 // accumulator.
3106 #define MACC2(A, B, T0, T1, T2)                                 \
3107 do {                                                            \
3108   unsigned long hi, lo;                                         \
3109   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3110            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3111            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3112            : "r"(A), "a"(B) : "cc");                            \
3113  } while(0)
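// In portable terms, MACC(A, B, T0, T1, T2) behaves roughly like the
// illustrative sketch below, where T2:T1:T0 is treated as one 192-bit
// accumulator (the real code relies on the x86 carry chain instead):
//
//   unsigned __int128 p = (unsigned __int128)A * B;
//   // add the low 64 bits of p into T0 and the high 64 bits into T1,
//   // propagating any carries up into T1 and T2
//
// MACC2 does the same but accumulates the double-length product twice.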
3114 
3115 #else //_WINDOWS
3116 
3117 static julong
3118 sub(julong a[], julong b[], julong carry, long len) {
3119   long i;
3120   julong tmp;
3121   unsigned char c = 1;
3122   for (i = 0; i < len; i++) {
3123     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3124     a[i] = tmp;
3125   }
3126   c = _addcarry_u64(c, carry, ~0, &tmp);
3127   return tmp;
3128 }
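// The Windows variant computes a - b as a + ~b + 1: _addcarry_u64 starts with
// carry = 1 and adds the complemented words of b, which is equivalent to a
// borrow chain; the final _addcarry_u64 against ~0 folds the remaining borrow
// into the incoming carry word.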
3129 
3130 // Multiply (unsigned) Long A by Long B, accumulating the double-
3131 // length result into the accumulator formed of T0, T1, and T2.
3132 #define MACC(A, B, T0, T1, T2)                          \
3133 do {                                                    \
3134   julong hi, lo;                            \
3135   lo = _umul128(A, B, &hi);                             \
3136   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3137   c = _addcarry_u64(c, hi, T1, &T1);                    \
3138   _addcarry_u64(c, T2, 0, &T2);                         \
3139  } while(0)
3140 
3141 // As above, but add twice the double-length result into the
3142 // accumulator.
3143 #define MACC2(A, B, T0, T1, T2)                         \
3144 do {                                                    \
3145   julong hi, lo;                            \
3146   lo = _umul128(A, B, &hi);                             \
3147   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3148   c = _addcarry_u64(c, hi, T1, &T1);                    \
3149   _addcarry_u64(c, T2, 0, &T2);                         \
3150   c = _addcarry_u64(0, lo, T0, &T0);                    \
3151   c = _addcarry_u64(c, hi, T1, &T1);                    \
3152   _addcarry_u64(c, T2, 0, &T2);                         \
3153  } while(0)
3154 
3155 #endif //_WINDOWS
3156 
3157 // Fast Montgomery multiplication.  The derivation of the algorithm is
3158 // in  A Cryptographic Library for the Motorola DSP56000,
3159 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3160 
3161 static void NOINLINE
3162 montgomery_multiply(julong a[], julong b[], julong n[],
3163                     julong m[], julong inv, int len) {
3164   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3165   int i;
3166 
3167   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3168 
3169   for (i = 0; i < len; i++) {
3170     int j;
3171     for (j = 0; j < i; j++) {
3172       MACC(a[j], b[i-j], t0, t1, t2);
3173       MACC(m[j], n[i-j], t0, t1, t2);
3174     }
3175     MACC(a[i], b[0], t0, t1, t2);
3176     m[i] = t0 * inv;
3177     MACC(m[i], n[0], t0, t1, t2);
3178 
3179     assert(t0 == 0, "broken Montgomery multiply");
3180 
3181     t0 = t1; t1 = t2; t2 = 0;
3182   }
3183 
3184   for (i = len; i < 2*len; i++) {
3185     int j;
3186     for (j = i-len+1; j < len; j++) {
3187       MACC(a[j], b[i-j], t0, t1, t2);
3188       MACC(m[j], n[i-j], t0, t1, t2);
3189     }
3190     m[i-len] = t0;
3191     t0 = t1; t1 = t2; t2 = 0;
3192   }
3193 
3194   while (t0)
3195     t0 = sub(m, n, t0, len);
3196 }
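// A note on the structure above: the first loop accumulates the low-order
// columns of a*b and, for each column, picks m[i] = t0 * inv so that adding
// m[i]*n[0] clears the column (inv is -1/n[0] mod 2^64, which is what the
// ULLONG_MAX assert checks).  The second loop finishes the high-order columns
// and writes the result digits into m[0..len-1]; the trailing while-loop
// subtracts n to clear any carry left in the top word.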
3197 
3198 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3199 // multiplies so it should be up to 25% faster than Montgomery
3200 // multiplication.  However, its loop control is more complex and it
3201 // may actually run slower on some machines.
3202 
3203 static void NOINLINE
3204 montgomery_square(julong a[], julong n[],
3205                   julong m[], julong inv, int len) {
3206   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3207   int i;
3208 
3209   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3210 
3211   for (i = 0; i < len; i++) {
3212     int j;
3213     int end = (i+1)/2;
3214     for (j = 0; j < end; j++) {
3215       MACC2(a[j], a[i-j], t0, t1, t2);
3216       MACC(m[j], n[i-j], t0, t1, t2);
3217     }
3218     if ((i & 1) == 0) {
3219       MACC(a[j], a[j], t0, t1, t2);
3220     }
3221     for (; j < i; j++) {
3222       MACC(m[j], n[i-j], t0, t1, t2);
3223     }
3224     m[i] = t0 * inv;
3225     MACC(m[i], n[0], t0, t1, t2);
3226 
3227     assert(t0 == 0, "broken Montgomery square");
3228 
3229     t0 = t1; t1 = t2; t2 = 0;
3230   }
3231 
3232   for (i = len; i < 2*len; i++) {
3233     int start = i-len+1;
3234     int end = start + (len - start)/2;
3235     int j;
3236     for (j = start; j < end; j++) {
3237       MACC2(a[j], a[i-j], t0, t1, t2);
3238       MACC(m[j], n[i-j], t0, t1, t2);
3239     }
3240     if ((i & 1) == 0) {
3241       MACC(a[j], a[j], t0, t1, t2);
3242     }
3243     for (; j < len; j++) {
3244       MACC(m[j], n[i-j], t0, t1, t2);
3245     }
3246     m[i-len] = t0;
3247     t0 = t1; t1 = t2; t2 = 0;
3248   }
3249 
3250   while (t0)
3251     t0 = sub(m, n, t0, len);
3252 }
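// Where the savings come from: in a*a every off-diagonal partial product
// a[j]*a[i-j] occurs twice, so the loops above accumulate it once with MACC2
// (which adds the double-length product twice) and handle the diagonal term
// a[j]*a[j] separately when the column index is even.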
3253 
3254 // Swap words in a longword.
3255 static julong swap(julong x) {
3256   return (x << 32) | (x >> 32);
3257 }
3258 
3259 // Copy len longwords from s to d, word-swapping as we go.  The
3260 // destination array is reversed.
3261 static void reverse_words(julong *s, julong *d, int len) {
3262   d += len;
3263   while(len-- > 0) {
3264     d--;
3265     *d = swap(*s);
3266     s++;
3267   }
3268 }
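// Together, swap() and reverse_words() convert the caller's jint arrays
// (most significant word first) into the least-significant-first julong
// layout the Montgomery routines above expect, and convert the result back.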
3269 
3270 // The threshold at which squaring is advantageous was determined
3271 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3272 #define MONTGOMERY_SQUARING_THRESHOLD 64
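// Note that the threshold is compared against len, a count of jints, so
// squaring kicks in for operands of 64 * 32 = 2048 bits and above.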
3273 
3274 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3275                                         jint len, jlong inv,
3276                                         jint *m_ints) {
3277   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3278   int longwords = len/2;
3279 
3280   // Make very sure we don't use so much space that the stack might
3281   // overflow.  512 jints corresponds to a 16384-bit integer and
3282   // will use here a total of 8k bytes of stack space.
3283   int total_allocation = longwords * sizeof (julong) * 4;
3284   guarantee(total_allocation <= 8192, "must be");
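  // For the worst case permitted above: 512 jints => longwords = 256, and
  // 4 scratch arrays * 256 julongs * 8 bytes = 8192 bytes.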
3285   julong *scratch = (julong *)alloca(total_allocation);
3286 
3287   // Local scratch arrays
3288   julong
3289     *a = scratch + 0 * longwords,
3290     *b = scratch + 1 * longwords,
3291     *n = scratch + 2 * longwords,
3292     *m = scratch + 3 * longwords;
3293 
3294   reverse_words((julong *)a_ints, a, longwords);
3295   reverse_words((julong *)b_ints, b, longwords);
3296   reverse_words((julong *)n_ints, n, longwords);
3297 
3298   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3299 
3300   reverse_words(m, (julong *)m_ints, longwords);
3301 }
3302 
3303 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3304                                       jint len, jlong inv,
3305                                       jint *m_ints) {
3306   assert(len % 2 == 0, "array length in montgomery_square must be even");
3307   int longwords = len/2;
3308 
3309   // Make very sure we don't use so much space that the stack might
3310   // overflow.  512 jints corresponds to a 16384-bit integer and
3311   // will use here a total of 6k bytes of stack space.
3312   int total_allocation = longwords * sizeof (julong) * 3;
3313   guarantee(total_allocation <= 8192, "must be");
3314   julong *scratch = (julong *)alloca(total_allocation);
3315 
3316   // Local scratch arrays
3317   julong
3318     *a = scratch + 0 * longwords,
3319     *n = scratch + 1 * longwords,
3320     *m = scratch + 2 * longwords;
3321 
3322   reverse_words((julong *)a_ints, a, longwords);
3323   reverse_words((julong *)n_ints, n, longwords);
3324 
3325   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3326     ::montgomery_square(a, n, m, (julong)inv, longwords);
3327   } else {
3328     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3329   }
3330 
3331   reverse_words(m, (julong *)m_ints, longwords);
3332 }
3333 
3334 #ifdef COMPILER2
3335 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3336 //
3337 //------------------------------generate_exception_blob---------------------------
3338 // Creates the exception blob at the end.
3339 // A compiled method jumps to this blob when it raises an exception
3340 // (see emit_exception_handler in the x86_64.ad file).
3341 //
3342 // Given an exception pc at a call, we call into the runtime for the
3343 // handler in this method. If there is no Java-level handler for the
3344 // nmethod, the returned handler might merely restore state (i.e.
3345 // callee-saved registers), unwind the frame, and continue propagating
3346 // the exception to the caller.
3347 //
3348 // This code is entered with a jmp.
3349 //
3350 // Arguments:
3351 //   rax: exception oop
3352 //   rdx: exception pc
3353 //
3354 // Results:
3355 //   rax: exception oop
3356 //   rdx: exception pc in caller or ???
3357 //   destination: exception handler of caller
3358 //
3359 // Note: the exception pc MUST be at a call (precise debug information)
3360 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3361 //
3362 
3363 void OptoRuntime::generate_exception_blob() {
3364   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3365   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3366   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3367 
3368   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3369 
3370   // Allocate space for the code
3371   ResourceMark rm;
3372   // Setup code generation tools
3373   CodeBuffer buffer("exception_blob", 2048, 1024);
3374   MacroAssembler* masm = new MacroAssembler(&buffer);
3375 
3376 
3377   address start = __ pc();
3378 
3379   // Exception pc is 'return address' for stack walker
3380   __ push(rdx);
3381   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3382 
3383   // Save callee-saved registers.  See x86_64.ad.
3384 
3385   // rbp is an implicitly saved callee saved register (i.e., the calling
3386   // convention will save/restore it in the prolog/epilog). Other than that
3387   // there are no callee save registers now that adapter frames are gone.
3388 
3389   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3390 
3391   // Store exception in Thread object. We cannot pass any arguments to the
3392   // handle_exception call, since we do not want to make any assumption
3393   // about the size of the frame where the exception happened in.
3394   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3395   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3396   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3397 
3398   // This call does all the hard work.  It checks if an exception handler
3399   // exists in the method.
3400   // If so, it returns the handler address.
3401   // If not, it prepares for stack-unwinding, restoring the callee-save
3402   // registers of the frame being removed.
3403   //
3404   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3405 
3406   // At a method handle call, the stack may not be properly aligned
3407   // when returning with an exception.
3408   address the_pc = __ pc();
3409   __ set_last_Java_frame(noreg, noreg, the_pc);
3410   __ mov(c_rarg0, r15_thread);
3411   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3412   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3413 
3414   // Set an oopmap for the call site.  This oopmap will only be used if we
3415   // are unwinding the stack.  Hence, all locations will be dead.
3416   // Callee-saved registers will be the same as the frame above (i.e.,
3417   // handle_exception_stub), since they were restored when we got the
3418   // exception.
3419 
3420   OopMapSet* oop_maps = new OopMapSet();
3421 
3422   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3423 
3424   __ reset_last_Java_frame(false);
3425 
3426   // Restore callee-saved registers
3427 
3428   // rbp is an implicitly saved callee-saved register (i.e., the calling
3429   // convention will save/restore it in the prolog/epilog). Other than that
3430   // there are no callee save registers now that adapter frames are gone.
3431 
3432   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3433 
3434   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3435   __ pop(rdx);                  // No need for exception pc anymore
3436 
3437   // rax: exception handler
3438 
3439   // We have a handler in rax (could be deopt blob).
3440   __ mov(r8, rax);
3441 
3442   // Get the exception oop
3443   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3444   // Get the exception pc in case we are deoptimized
3445   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3446 #ifdef ASSERT
3447   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3448   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3449 #endif
3450   // Clear the exception oop so GC no longer processes it as a root.
3451   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3452 
3453   // rax: exception oop
3454   // r8:  exception handler
3455   // rdx: exception pc
3456   // Jump to handler
3457 
3458   __ jmp(r8);
3459 
3460   // Make sure all code is generated
3461   masm->flush();
3462 
3463   // Set exception blob
3464   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3465 }
3466 #endif // COMPILER2
3467