1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/icBuffer.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/compiledICHolder.hpp"
  46 #include "oops/klass.inline.hpp"
  47 #include "prims/methodHandles.hpp"
  48 #include "runtime/jniHandles.hpp"
  49 #include "runtime/safepointMechanism.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/signature.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "runtime/vframeArray.hpp"
  54 #include "runtime/vm_version.hpp"
  55 #include "utilities/align.hpp"
  56 #include "utilities/formatBuffer.hpp"
  57 #include "vmreg_x86.inline.hpp"
  58 #ifdef COMPILER1
  59 #include "c1/c1_Runtime1.hpp"
  60 #endif
  61 #ifdef COMPILER2
  62 #include "opto/runtime.hpp"
  63 #endif
  64 #if INCLUDE_JVMCI
  65 #include "jvmci/jvmciJavaClasses.hpp"
  66 #endif
  67 
  68 #define __ masm->
  69 
  70 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  71 
  72 class SimpleRuntimeFrame {
  73 
  74   public:
  75 
  76   // Most of the runtime stubs have this simple frame layout.
  77   // This class exists to make the layout shared in one place.
  78   // Offsets are for compiler stack slots, which are jints.
  79   enum layout {
  80     // The frame sender code expects that rbp will be in the "natural" place and
  81     // will override any oopMap setting for it. We must therefore force the layout
  82     // so that it agrees with the frame sender code.
  83     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  84     rbp_off2,
  85     return_off, return_off2,
  86     framesize
  87   };
  88 };
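     // So a SimpleRuntimeFrame, in 4-byte slots going up from the stack pointer, is:
     //   [arg_reg_save_area] [saved rbp: 2 slots] [return address: 2 slots]
     // and 'framesize' is the total frame size in slots.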
  89 
  90 class RegisterSaver {
  91   // Capture info about frame layout.  Layout offsets are in jint
  92   // units because compiler frame slots are jints.
  93 #define XSAVE_AREA_BEGIN 160
  94 #define XSAVE_AREA_YMM_BEGIN 576
  95 #define XSAVE_AREA_OPMASK_BEGIN 1088
  96 #define XSAVE_AREA_ZMM_BEGIN 1152
  97 #define XSAVE_AREA_UPPERBANK 1664
  98 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  99 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 100 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 101 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 102 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
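     // For example, DEF_XMM_OFFS(1) expands to
     //   xmm1_off = xmm_off + 1*16/BytesPerInt, xmm1H_off
     // i.e. consecutive XMM registers sit 16 bytes (4 jint slots) apart in the
     // fxsave image, and the *H_off enumerator simply names the following slot.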
 103   enum layout {
 104     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 105     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 106     DEF_XMM_OFFS(0),
 107     DEF_XMM_OFFS(1),
 108     // 2..15 are implied in range usage
 109     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 110     DEF_YMM_OFFS(0),
 111     DEF_YMM_OFFS(1),
 112     // 2..15 are implied in range usage
 113     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 114     DEF_OPMASK_OFFS(0),
 115     DEF_OPMASK_OFFS(1),
 116     // 2..7 are implied in range usage
 117     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 118     DEF_ZMM_OFFS(0),
 119     DEF_ZMM_OFFS(1),
 120     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 121     DEF_ZMM_UPPER_OFFS(16),
 122     DEF_ZMM_UPPER_OFFS(17),
 123     // 18..31 are implied in range usage
 124     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 125     fpu_stateH_end,
 126     r15_off, r15H_off,
 127     r14_off, r14H_off,
 128     r13_off, r13H_off,
 129     r12_off, r12H_off,
 130     r11_off, r11H_off,
 131     r10_off, r10H_off,
 132     r9_off,  r9H_off,
 133     r8_off,  r8H_off,
 134     rdi_off, rdiH_off,
 135     rsi_off, rsiH_off,
 136     ignore_off, ignoreH_off,  // extra copy of rbp
 137     rsp_off, rspH_off,
 138     rbx_off, rbxH_off,
 139     rdx_off, rdxH_off,
 140     rcx_off, rcxH_off,
 141     rax_off, raxH_off,
 142     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 143     align_off, alignH_off,
 144     flags_off, flagsH_off,
 145     // The frame sender code expects that rbp will be in the "natural" place and
 146     // will override any oopMap setting for it. We must therefore force the layout
 147     // so that it agrees with the frame sender code.
 148     rbp_off, rbpH_off,        // copy of rbp we will restore
 149     return_off, returnH_off,  // slot for return address
 150     reg_save_size             // size in compiler stack slots
 151   };
 152 
 153  public:
 154   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors);
 155   static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 156 
 157   // Offsets into the register save area
 158   // Used by deoptimization when it is managing result register
 159   // values on its own
 160 
 161   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 162   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 163   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 164   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 165   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 166 
 167   // During deoptimization only the result registers need to be restored,
 168   // all the other values have already been extracted.
 169   static void restore_result_registers(MacroAssembler* masm);
 170 };
 171 
 172 // Register is a class, but it would be assigned a numerical value.
 173 // "0" is assigned for rax. Thus we need to ignore -Wnonnull.
 174 PRAGMA_DIAG_PUSH
 175 PRAGMA_NONNULL_IGNORED
 176 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
 177   int off = 0;
 178   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 179   if (UseAVX < 3) {
 180     num_xmm_regs = num_xmm_regs/2;
 181   }
 182 #if COMPILER2_OR_JVMCI
 183   if (save_vectors && UseAVX == 0) {
 184     save_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 185   }
 186   assert(!save_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 187 #else
 188   save_vectors = false; // vectors are generated only by C2 and JVMCI
 189 #endif
 190 
 191   // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated this way.
 192   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 193   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 194   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 195   // CodeBlob frame size is in words.
 196   int frame_size_in_words = frame_size_in_bytes / wordSize;
 197   *total_frame_words = frame_size_in_words;
 198 
 199   // Save registers, fpu state, and flags.
 200   // We assume caller has already pushed the return address onto the
 201   // stack, so rsp is 8-byte aligned here.
 202   // We push rbp twice in this sequence because we want the real rbp
 203   // to be under the return address, like a normal enter.
 204 
 205   __ enter();          // rsp becomes 16-byte aligned here
 206   __ push_CPU_state(); // Push a multiple of 16 bytes
 207 
 208   // push_CPU_state handles this on EVEX-enabled targets
 209   if (save_vectors) {
 210     // Save upper half of YMM registers(0..15)
 211     int base_addr = XSAVE_AREA_YMM_BEGIN;
 212     for (int n = 0; n < 16; n++) {
 213       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 214     }
 215     if (VM_Version::supports_evex()) {
 216       // Save upper half of ZMM registers(0..15)
 217       base_addr = XSAVE_AREA_ZMM_BEGIN;
 218       for (int n = 0; n < 16; n++) {
 219         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 220       }
 221       // Save full ZMM registers(16..num_xmm_regs)
 222       base_addr = XSAVE_AREA_UPPERBANK;
 223       off = 0;
 224       int vector_len = Assembler::AVX_512bit;
 225       for (int n = 16; n < num_xmm_regs; n++) {
 226         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 227       }
 228 #if COMPILER2_OR_JVMCI
 229       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 230       off = 0;
 231       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 232         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 233       }
 234 #endif
 235     }
 236   } else {
 237     if (VM_Version::supports_evex()) {
 238       // Save upper bank of ZMM registers(16..31) for double/float usage
 239       int base_addr = XSAVE_AREA_UPPERBANK;
 240       off = 0;
 241       for (int n = 16; n < num_xmm_regs; n++) {
 242         __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
 243       }
 244 #if COMPILER2_OR_JVMCI
 245       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 246       off = 0;
 247       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 248         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 249       }
 250 #endif
 251     }
 252   }
 253   __ vzeroupper();
 254   if (frame::arg_reg_save_area_bytes != 0) {
 255     // Allocate argument register save area
 256     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 257   }
 258 
 259   // Set an oopmap for the call site.  This oopmap will map all
 260   // oop-registers and debug-info registers as callee-saved.  This
 261   // will allow deoptimization at this safepoint to find all possible
 262   // debug-info recordings, as well as let GC find all oops.
 263 
 264   OopMapSet *oop_maps = new OopMapSet();
 265   OopMap* map = new OopMap(frame_size_in_slots, 0);
 266 
 267 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 268 
 269   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 271   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 272   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 273   // rbp location is known implicitly by the frame sender code, needs no oopmap
 274   // and the location where rbp was saved is ignored
 275   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 283   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 284   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 285   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 286   // on EVEX-enabled targets we get it included in the xsave area
 287   off = xmm0_off;
 288   int delta = xmm1_off - off;
 289   for (int n = 0; n < 16; n++) {
 290     XMMRegister xmm_name = as_XMMRegister(n);
 291     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 292     off += delta;
 293   }
 294   if (UseAVX > 2) {
 295     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 296     off = zmm16_off;
 297     delta = zmm17_off - off;
 298     for (int n = 16; n < num_xmm_regs; n++) {
 299       XMMRegister zmm_name = as_XMMRegister(n);
 300       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 301       off += delta;
 302     }
 303   }
 304 
 305 #if COMPILER2_OR_JVMCI
 306   if (save_vectors) {
 307     // Save upper half of YMM registers(0..15)
 308     off = ymm0_off;
 309     delta = ymm1_off - ymm0_off;
 310     for (int n = 0; n < 16; n++) {
 311       XMMRegister ymm_name = as_XMMRegister(n);
 312       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 313       off += delta;
 314     }
 315     if (VM_Version::supports_evex()) {
 316       // Save upper half of ZMM registers(0..15)
 317       off = zmm0_off;
 318       delta = zmm1_off - zmm0_off;
 319       for (int n = 0; n < 16; n++) {
 320         XMMRegister zmm_name = as_XMMRegister(n);
 321         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 322         off += delta;
 323       }
 324     }
 325   }
 326 #endif // COMPILER2_OR_JVMCI
 327 
 328   // %%% These should all be a waste but we'll keep things as they were for now
 329   if (true) {
 330     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 333     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 334     // rbp location is known implicitly by the frame sender code, needs no oopmap
 335     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 343     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 344     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 345     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 346     // on EVEX-enabled targets we get it included in the xsave area
 347     off = xmm0H_off;
 348     delta = xmm1H_off - off;
 349     for (int n = 0; n < 16; n++) {
 350       XMMRegister xmm_name = as_XMMRegister(n);
 351       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 352       off += delta;
 353     }
 354     if (UseAVX > 2) {
 355       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 356       off = zmm16H_off;
 357       delta = zmm17H_off - off;
 358       for (int n = 16; n < num_xmm_regs; n++) {
 359         XMMRegister zmm_name = as_XMMRegister(n);
 360         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 361         off += delta;
 362       }
 363     }
 364   }
 365 
 366   return map;
 367 }
 368 PRAGMA_DIAG_POP
 369 
 370 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
 371   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 372   if (UseAVX < 3) {
 373     num_xmm_regs = num_xmm_regs/2;
 374   }
 375   if (frame::arg_reg_save_area_bytes != 0) {
 376     // Pop arg register save area
 377     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 378   }
 379 
 380 #if COMPILER2_OR_JVMCI
 381   if (restore_vectors) {
 382     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 383     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 384   }
 385 #else
 386   assert(!restore_vectors, "vectors are generated only by C2");
 387 #endif
 388 
 389   __ vzeroupper();
 390 
 391   // On EVEX enabled targets everything is handled in pop fpu state
 392   if (restore_vectors) {
 393     // Restore upper half of YMM registers (0..15)
 394     int base_addr = XSAVE_AREA_YMM_BEGIN;
 395     for (int n = 0; n < 16; n++) {
 396       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 397     }
 398     if (VM_Version::supports_evex()) {
 399       // Restore upper half of ZMM registers (0..15)
 400       base_addr = XSAVE_AREA_ZMM_BEGIN;
 401       for (int n = 0; n < 16; n++) {
 402         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 403       }
 404       // Restore full ZMM registers(16..num_xmm_regs)
 405       base_addr = XSAVE_AREA_UPPERBANK;
 406       int vector_len = Assembler::AVX_512bit;
 407       int off = 0;
 408       for (int n = 16; n < num_xmm_regs; n++) {
 409         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 410       }
 411 #if COMPILER2_OR_JVMCI
 412       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 413       off = 0;
 414       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 415         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 416       }
 417 #endif
 418     }
 419   } else {
 420     if (VM_Version::supports_evex()) {
 421       // Restore upper bank of ZMM registers(16..31) for double/float usage
 422       int base_addr = XSAVE_AREA_UPPERBANK;
 423       int off = 0;
 424       for (int n = 16; n < num_xmm_regs; n++) {
 425         __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
 426       }
 427 #if COMPILER2_OR_JVMCI
 428       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 429       off = 0;
 430       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 431         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 432       }
 433 #endif
 434     }
 435   }
 436 
 437   // Recover CPU state
 438   __ pop_CPU_state();
 439   // Get the rbp described implicitly by the calling convention (no oopMap)
 440   __ pop(rbp);
 441 }
 442 
 443 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 444 
 445   // Just restore result register. Only used by deoptimization. By
 446   // now any callee save register that needs to be restored to a c2
 447   // caller of the deoptee has been extracted into the vframeArray
 448   // and will be stuffed into the c2i adapter we create for later
 449   // restoration so only result registers need to be restored here.
 450 
 451   // Restore fp result register
 452   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 453   // Restore integer result register
 454   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 455   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 456 
 457   // Pop all of the register save area off the stack except the return address
 458   __ addptr(rsp, return_offset_in_bytes());
 459 }
 460 
 461 // Is the vector's size (in bytes) bigger than the size saved by default?
 462 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 463 bool SharedRuntime::is_wide_vector(int size) {
 464   return size > 16;
 465 }
 466 
 467 // ---------------------------------------------------------------------------
 468 // Read the array of BasicTypes from a signature, and compute where the
 469 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 470 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 471 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 472 // as framesizes are fixed.
 473 // VMRegImpl::stack0 refers to the first slot 0(sp),
 474 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Register
 475 // values up to RegisterImpl::number_of_registers are the 64-bit
 476 // integer registers.
 477 
 478 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 479 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 480 // units regardless of the build. Of course for i486 there is no 64-bit build
 481 
 482 // The Java calling convention is a "shifted" version of the C ABI.
 483 // By skipping the first C ABI register we can call non-static jni methods
 484 // with small numbers of arguments without having to shuffle the arguments
 485 // at all. Since we control the java ABI we ought to at least get some
 486 // advantage out of it.
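     // For example, with the receiver of a JNI call already sitting in j_rarg0
     // (which aliases the second C argument register), only the JNIEnv* needs to
     // be materialized in c_rarg0 before jumping to the native entry.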
 487 
 488 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 489                                            VMRegPair *regs,
 490                                            int total_args_passed) {
 491 
 492   // Create the mapping between argument positions and
 493   // registers.
 494   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 495     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 496   };
 497   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 498     j_farg0, j_farg1, j_farg2, j_farg3,
 499     j_farg4, j_farg5, j_farg6, j_farg7
 500   };
 501 
 502 
 503   uint int_args = 0;
 504   uint fp_args = 0;
 505   uint stk_args = 0; // inc by 2 each time
 506 
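       // Example: for an incoming signature {T_OBJECT, T_LONG, T_VOID, T_INT} the
       // loop below assigns j_rarg0, j_rarg1 (the trailing T_VOID half is set_bad())
       // and j_rarg2, leaving stk_args at 0.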
 507   for (int i = 0; i < total_args_passed; i++) {
 508     switch (sig_bt[i]) {
 509     case T_BOOLEAN:
 510     case T_CHAR:
 511     case T_BYTE:
 512     case T_SHORT:
 513     case T_INT:
 514       if (int_args < Argument::n_int_register_parameters_j) {
 515         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 516       } else {
 517         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 518         stk_args += 2;
 519       }
 520       break;
 521     case T_VOID:
 522       // halves of T_LONG or T_DOUBLE
 523       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 524       regs[i].set_bad();
 525       break;
 526     case T_LONG:
 527       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 528       // fall through
 529     case T_OBJECT:
 530     case T_ARRAY:
 531     case T_ADDRESS:
 532       if (int_args < Argument::n_int_register_parameters_j) {
 533         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 534       } else {
 535         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 536         stk_args += 2;
 537       }
 538       break;
 539     case T_FLOAT:
 540       if (fp_args < Argument::n_float_register_parameters_j) {
 541         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 542       } else {
 543         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 544         stk_args += 2;
 545       }
 546       break;
 547     case T_DOUBLE:
 548       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 549       if (fp_args < Argument::n_float_register_parameters_j) {
 550         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 551       } else {
 552         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 553         stk_args += 2;
 554       }
 555       break;
 556     default:
 557       ShouldNotReachHere();
 558       break;
 559     }
 560   }
 561 
 562   return align_up(stk_args, 2);
 563 }
 564 
 565 // Patch the caller's callsite with the entry to compiled code if it exists.
 566 static void patch_callers_callsite(MacroAssembler *masm) {
 567   Label L;
 568   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
 569   __ jcc(Assembler::equal, L);
 570 
 571   // Save the current stack pointer
 572   __ mov(r13, rsp);
 573   // Schedule the branch target address early.
 574   // Call into the VM to patch the caller, then jump to compiled callee
 575   // rax isn't live so capture return address while we easily can
 576   __ movptr(rax, Address(rsp, 0));
 577 
 578   // align stack so push_CPU_state doesn't fault
 579   __ andptr(rsp, -(StackAlignmentInBytes));
 580   __ push_CPU_state();
 581   __ vzeroupper();
 582   // VM needs caller's callsite
 583   // VM needs target method
 584   // This needs to be a long call since we will relocate this adapter to
 585   // the codeBuffer and it may not reach
 586 
 587   // Allocate argument register save area
 588   if (frame::arg_reg_save_area_bytes != 0) {
 589     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 590   }
 591   __ mov(c_rarg0, rbx);
 592   __ mov(c_rarg1, rax);
 593   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 594 
 595   // De-allocate argument register save area
 596   if (frame::arg_reg_save_area_bytes != 0) {
 597     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 598   }
 599 
 600   __ vzeroupper();
 601   __ pop_CPU_state();
 602   // restore sp
 603   __ mov(rsp, r13);
 604   __ bind(L);
 605 }
 606 
 607 
 608 static void gen_c2i_adapter(MacroAssembler *masm,
 609                             int total_args_passed,
 610                             int comp_args_on_stack,
 611                             const BasicType *sig_bt,
 612                             const VMRegPair *regs,
 613                             Label& skip_fixup) {
 614   // Before we get into the guts of the C2I adapter, see if we should be here
 615   // at all.  We've come from compiled code and are attempting to jump to the
 616   // interpreter, which means the caller made a static call to get here
 617   // (vcalls always get a compiled target if there is one).  Check for a
 618   // compiled target.  If there is one, we need to patch the caller's call.
 619   patch_callers_callsite(masm);
 620 
 621   __ bind(skip_fixup);
 622 
 623   // Since all args are passed on the stack, total_args_passed *
 624   // Interpreter::stackElementSize is the space we need. Plus 1 because
 625   // we also account for the return address location since
 626   // we store it first rather than hold it in rax across all the shuffling
 627 
 628   int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
 629 
 630   // stack is aligned, keep it that way
 631   extraspace = align_up(extraspace, 2*wordSize);
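       // With an 8-byte Interpreter::stackElementSize (as in the worked example
       // below), 4 arguments give 4*8 + 8 = 40 bytes, rounded up to 48.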
 632 
 633   // Get return address
 634   __ pop(rax);
 635 
 636   // set senderSP value
 637   __ mov(r13, rsp);
 638 
 639   __ subptr(rsp, extraspace);
 640 
 641   // Store the return address in the expected location
 642   __ movptr(Address(rsp, 0), rax);
 643 
 644   // Now write the args into the outgoing interpreter space
 645   for (int i = 0; i < total_args_passed; i++) {
 646     if (sig_bt[i] == T_VOID) {
 647       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 648       continue;
 649     }
 650 
 651     // offset to start parameters
 652     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 653     int next_off = st_off - Interpreter::stackElementSize;
 654 
 655     // Say 4 args:
 656     // i   st_off
 657     // 0   32 T_LONG
 658     // 1   24 T_VOID
 659     // 2   16 T_OBJECT
 660     // 3    8 T_BOOL
 661     // -    0 return address
 662     //
 663     // However, to make things extra confusing: because we can fit a long/double in
 664     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 665     // leaves one slot empty and only stores to a single slot. In this case the
 666     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
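         // In the example above the T_LONG value is therefore stored at offset 24
         // (its T_VOID slot, i.e. next_off), while offset 32 is left unused (and is
         // filled with junk in ASSERT builds).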
 667 
 668     VMReg r_1 = regs[i].first();
 669     VMReg r_2 = regs[i].second();
 670     if (!r_1->is_valid()) {
 671       assert(!r_2->is_valid(), "");
 672       continue;
 673     }
 674     if (r_1->is_stack()) {
 675       // memory to memory use rax
 676       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 677       if (!r_2->is_valid()) {
 678         // sign extend??
 679         __ movl(rax, Address(rsp, ld_off));
 680         __ movptr(Address(rsp, st_off), rax);
 681 
 682       } else {
 683 
 684         __ movq(rax, Address(rsp, ld_off));
 685 
 686         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 687         // T_DOUBLE and T_LONG use two slots in the interpreter
 688         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 689           // ld_off == LSW, ld_off+wordSize == MSW
 690           // st_off == MSW, next_off == LSW
 691           __ movq(Address(rsp, next_off), rax);
 692 #ifdef ASSERT
 693           // Overwrite the unused slot with known junk
 694           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 695           __ movptr(Address(rsp, st_off), rax);
 696 #endif /* ASSERT */
 697         } else {
 698           __ movq(Address(rsp, st_off), rax);
 699         }
 700       }
 701     } else if (r_1->is_Register()) {
 702       Register r = r_1->as_Register();
 703       if (!r_2->is_valid()) {
 704         // must be only an int (or less) so move only 32 bits to the slot
 705         // why not sign extend??
 706         __ movl(Address(rsp, st_off), r);
 707       } else {
 708         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 709         // T_DOUBLE and T_LONG use two slots in the interpreter
 710         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 711           // long/double in gpr
 712 #ifdef ASSERT
 713           // Overwrite the unused slot with known junk
 714           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 715           __ movptr(Address(rsp, st_off), rax);
 716 #endif /* ASSERT */
 717           __ movq(Address(rsp, next_off), r);
 718         } else {
 719           __ movptr(Address(rsp, st_off), r);
 720         }
 721       }
 722     } else {
 723       assert(r_1->is_XMMRegister(), "");
 724       if (!r_2->is_valid()) {
 725         // only a float; use just part of the slot
 726         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 727       } else {
 728 #ifdef ASSERT
 729         // Overwrite the unused slot with known junk
 730         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 731         __ movptr(Address(rsp, st_off), rax);
 732 #endif /* ASSERT */
 733         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 734       }
 735     }
 736   }
 737 
 738   // Schedule the branch target address early.
 739   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 740   __ jmp(rcx);
 741 }
 742 
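     // Branches to L_ok if pc_reg lies strictly inside (code_start, code_end);
     // otherwise falls through, so the caller can chain several range checks
     // before reporting a failure.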
 743 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 744                         address code_start, address code_end,
 745                         Label& L_ok) {
 746   Label L_fail;
 747   __ lea(temp_reg, ExternalAddress(code_start));
 748   __ cmpptr(pc_reg, temp_reg);
 749   __ jcc(Assembler::belowEqual, L_fail);
 750   __ lea(temp_reg, ExternalAddress(code_end));
 751   __ cmpptr(pc_reg, temp_reg);
 752   __ jcc(Assembler::below, L_ok);
 753   __ bind(L_fail);
 754 }
 755 
 756 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 757                                     int total_args_passed,
 758                                     int comp_args_on_stack,
 759                                     const BasicType *sig_bt,
 760                                     const VMRegPair *regs) {
 761 
 762   // Note: r13 contains the senderSP on entry. We must preserve it since
 763   // we may do an i2c -> c2i transition if we lose a race where compiled
 764   // code goes non-entrant while we get args ready.
 765   // In addition we use r13 to locate all the interpreter args because
 766   // we must align the stack to 16 bytes on an i2c entry; otherwise we
 767   // lose the alignment we expect in all compiled code, and the register
 768   // save code can segv when fxsave instructions find an improperly
 769   // aligned stack pointer.
 770 
 771   // Adapters can be frameless because they do not require the caller
 772   // to perform additional cleanup work, such as correcting the stack pointer.
 773   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 774   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 775   // even if a callee has modified the stack pointer.
 776   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 777   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 778   // up via the senderSP register).
 779   // In other words, if *either* the caller or callee is interpreted, we can
 780   // get the stack pointer repaired after a call.
 781   // This is why c2i and i2c adapters cannot be indefinitely composed.
 782   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 783   // both caller and callee would be compiled methods, and neither would
 784   // clean up the stack pointer changes performed by the two adapters.
 785   // If this happens, control eventually transfers back to the compiled
 786   // caller, but with an uncorrected stack, causing delayed havoc.
 787 
 788   // Pick up the return address
 789   __ movptr(rax, Address(rsp, 0));
 790 
 791   if (VerifyAdapterCalls &&
 792       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
 793     // So, let's test for cascading c2i/i2c adapters right now.
 794     //  assert(Interpreter::contains($return_addr) ||
 795     //         StubRoutines::contains($return_addr),
 796     //         "i2c adapter must return to an interpreter frame");
 797     __ block_comment("verify_i2c { ");
 798     Label L_ok;
 799     if (Interpreter::code() != NULL)
 800       range_check(masm, rax, r11,
 801                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
 802                   L_ok);
 803     if (StubRoutines::code1() != NULL)
 804       range_check(masm, rax, r11,
 805                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
 806                   L_ok);
 807     if (StubRoutines::code2() != NULL)
 808       range_check(masm, rax, r11,
 809                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
 810                   L_ok);
 811     const char* msg = "i2c adapter must return to an interpreter frame";
 812     __ block_comment(msg);
 813     __ stop(msg);
 814     __ bind(L_ok);
 815     __ block_comment("} verify_i2c ");
 816   }
 817 
 818   // Must preserve original SP for loading incoming arguments because
 819   // we need to align the outgoing SP for compiled code.
 820   __ movptr(r11, rsp);
 821 
 822   // Cut-out for having no stack args.  Since up to 6 int/oop args are passed
 823   // in registers, we will occasionally have no stack args.
 824   int comp_words_on_stack = 0;
 825   if (comp_args_on_stack) {
 826     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
 827     // registers are below.  By subtracting stack0, we either get a negative
 828     // number (all values in registers) or the maximum stack slot accessed.
 829 
 830     // Convert 4-byte c2 stack slots to words.
 831     comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 832     // Round up to minimum stack alignment, in wordSize
 833     comp_words_on_stack = align_up(comp_words_on_stack, 2);
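         // e.g. comp_args_on_stack == 3 slots -> 12 bytes -> 2 words after rounding,
         // so the subptr below reserves 16 bytes.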
 834     __ subptr(rsp, comp_words_on_stack * wordSize);
 835   }
 836 
 837 
 838   // Ensure compiled code always sees stack at proper alignment
 839   __ andptr(rsp, -16);
 840 
 841   // push the return address and misalign the stack so that the youngest frame
 842   // always sees the layout it would see right after a call instruction
 843   __ push(rax);
 844 
 845   // Put saved SP in another register
 846   const Register saved_sp = rax;
 847   __ movptr(saved_sp, r11);
 848 
 849   // Will jump to the compiled code just as if compiled code was doing it.
 850   // Pre-load the register-jump target early, to schedule it better.
 851   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 852 
 853 #if INCLUDE_JVMCI
 854   if (EnableJVMCI) {
 855     // check if this call should be routed towards a specific entry point
 856     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 857     Label no_alternative_target;
 858     __ jcc(Assembler::equal, no_alternative_target);
 859     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 860     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 861     __ bind(no_alternative_target);
 862   }
 863 #endif // INCLUDE_JVMCI
 864 
 865   // Now generate the shuffle code.  Pick up all register args and move the
 866   // rest through the floating point stack top.
 867   for (int i = 0; i < total_args_passed; i++) {
 868     if (sig_bt[i] == T_VOID) {
 869       // Longs and doubles are passed in native word order, but misaligned
 870       // in the 32-bit build.
 871       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 872       continue;
 873     }
 874 
 875     // Pick up 0, 1 or 2 words from SP+offset.
 876 
 877     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 878             "scrambled load targets?");
 879     // Load in argument order going down.
 880     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 881     // Point to interpreter value (vs. tag)
 882     int next_off = ld_off - Interpreter::stackElementSize;
 883     //
 884     //
 885     //
 886     VMReg r_1 = regs[i].first();
 887     VMReg r_2 = regs[i].second();
 888     if (!r_1->is_valid()) {
 889       assert(!r_2->is_valid(), "");
 890       continue;
 891     }
 892     if (r_1->is_stack()) {
 893       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 894       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 895 
 896       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 897       // and if we end up going through a c2i because of a miss, a reasonable value of r13
 898       // will be generated.
 899       if (!r_2->is_valid()) {
 900         // sign extend???
 901         __ movl(r13, Address(saved_sp, ld_off));
 902         __ movptr(Address(rsp, st_off), r13);
 903       } else {
 904         //
 905         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
 906         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
 907         // so we must adjust where to pick up the data to match the interpreter.
 908         //
 909         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
 910         // are accessed at negative offsets, so the LSW is at the LOW address.
 911 
 912         // ld_off is MSW so get LSW
 913         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 914                            next_off : ld_off;
 915         __ movq(r13, Address(saved_sp, offset));
 916         // st_off is LSW (i.e. reg.first())
 917         __ movq(Address(rsp, st_off), r13);
 918       }
 919     } else if (r_1->is_Register()) {  // Register argument
 920       Register r = r_1->as_Register();
 921       assert(r != rax, "must be different");
 922       if (r_2->is_valid()) {
 923         //
 924         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
 925         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
 926         // so we must adjust where to pick up the data to match the interpreter.
 927 
 928         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 929                            next_off : ld_off;
 930 
 931         // this can be a misaligned move
 932         __ movq(r, Address(saved_sp, offset));
 933       } else {
 934         // sign extend and use a full word?
 935         __ movl(r, Address(saved_sp, ld_off));
 936       }
 937     } else {
 938       if (!r_2->is_valid()) {
 939         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 940       } else {
 941         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 942       }
 943     }
 944   }
 945 
 946   __ push_cont_fastpath(r15_thread); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 947 
 948   // 6243940 We might end up in handle_wrong_method if
 949   // the callee is deoptimized as we race through here. If that
 950   // happens we don't want to take a safepoint because the
 951   // caller frame will look interpreted and arguments are now
 952   // "compiled" so it is much better to make this transition
 953   // invisible to the stack walking code. Unfortunately if
 954   // we try and find the callee by normal means a safepoint
 955   // is possible. So we stash the desired callee in the thread
 956   // and the VM will find it there should this case occur.
 957 
 958   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 959 
 960   // put Method* where a c2i would expect it, should we end up there.
 961   // This is only needed because c2 resolve stubs return Method* as a result in
 962   // rax.
 963   __ mov(rax, rbx);
 964   __ jmp(r11);
 965 }
 966 
 967 // ---------------------------------------------------------------
 968 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 969                                                             int total_args_passed,
 970                                                             int comp_args_on_stack,
 971                                                             const BasicType *sig_bt,
 972                                                             const VMRegPair *regs,
 973                                                             AdapterFingerPrint* fingerprint) {
 974   address i2c_entry = __ pc();
 975 
 976   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 977 
 978   // -------------------------------------------------------------------------
 979   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 980   // to the interpreter.  The args start out packed in the compiled layout.  They
 981   // need to be unpacked into the interpreter layout.  This will almost always
 982   // require some stack space.  We grow the current (compiled) stack, then repack
 983   // the args.  We  finally end in a jump to the generic interpreter entry point.
 984   // On exit from the interpreter, the interpreter will restore our SP (lest the
 985   // compiled code, which relies solely on SP and not RBP, get sick).
 986 
 987   address c2i_unverified_entry = __ pc();
 988   Label skip_fixup;
 989   Label ok;
 990 
 991   Register holder = rax;
 992   Register receiver = j_rarg0;
 993   Register temp = rbx;
 994 
 995   {
 996     __ load_klass(temp, receiver, rscratch1);
 997     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
 998     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
 999     __ jcc(Assembler::equal, ok);
1000     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1001 
1002     __ bind(ok);
1003     // Method might have been compiled since the call site was patched to
1004     // interpreted; if that is the case, treat it as a miss so we can get
1005     // the call site corrected.
1006     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
1007     __ jcc(Assembler::equal, skip_fixup);
1008     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1009   }
1010 
1011   address c2i_entry = __ pc();
1012 
1013   // Class initialization barrier for static methods
1014   address c2i_no_clinit_check_entry = NULL;
1015   if (VM_Version::supports_fast_class_init_checks()) {
1016     Label L_skip_barrier;
1017     Register method = rbx;
1018 
1019     { // Bypass the barrier for non-static methods
1020       Register flags  = rscratch1;
1021       __ movl(flags, Address(method, Method::access_flags_offset()));
1022       __ testl(flags, JVM_ACC_STATIC);
1023       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1024     }
1025 
1026     Register klass = rscratch1;
1027     __ load_method_holder(klass, method);
1028     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1029 
1030     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1031 
1032     __ bind(L_skip_barrier);
1033     c2i_no_clinit_check_entry = __ pc();
1034   }
1035 
1036   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1037   bs->c2i_entry_barrier(masm);
1038 
1039   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1040 
1041   __ flush();
1042   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1043 }
1044 
1045 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1046                                          VMRegPair *regs,
1047                                          VMRegPair *regs2,
1048                                          int total_args_passed) {
1049   assert(regs2 == NULL, "not needed on x86");
1050 // We return the amount of VMRegImpl stack slots we need to reserve for all
1051 // the arguments NOT counting out_preserve_stack_slots.
1052 
1053 // NOTE: These arrays will have to change when c1 is ported
1054 #ifdef _WIN64
1055     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1056       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1057     };
1058     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1059       c_farg0, c_farg1, c_farg2, c_farg3
1060     };
1061 #else
1062     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1063       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1064     };
1065     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1066       c_farg0, c_farg1, c_farg2, c_farg3,
1067       c_farg4, c_farg5, c_farg6, c_farg7
1068     };
1069 #endif // _WIN64
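         // Note: on Windows the four argument positions are shared between the
         // integer and floating point registers (hence the int_args++/fp_args++
         // bumps below), while the System V ABI provides 6 integer and 8 FP
         // argument registers.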
1070 
1071 
1072     uint int_args = 0;
1073     uint fp_args = 0;
1074     uint stk_args = 0; // inc by 2 each time
1075 
1076     for (int i = 0; i < total_args_passed; i++) {
1077       switch (sig_bt[i]) {
1078       case T_BOOLEAN:
1079       case T_CHAR:
1080       case T_BYTE:
1081       case T_SHORT:
1082       case T_INT:
1083         if (int_args < Argument::n_int_register_parameters_c) {
1084           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1085 #ifdef _WIN64
1086           fp_args++;
1087           // Allocate slots for callee to stuff register args on the stack.
1088           stk_args += 2;
1089 #endif
1090         } else {
1091           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1092           stk_args += 2;
1093         }
1094         break;
1095       case T_LONG:
1096         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1097         // fall through
1098       case T_OBJECT:
1099       case T_ARRAY:
1100       case T_ADDRESS:
1101       case T_METADATA:
1102         if (int_args < Argument::n_int_register_parameters_c) {
1103           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1104 #ifdef _WIN64
1105           fp_args++;
1106           stk_args += 2;
1107 #endif
1108         } else {
1109           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1110           stk_args += 2;
1111         }
1112         break;
1113       case T_FLOAT:
1114         if (fp_args < Argument::n_float_register_parameters_c) {
1115           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1116 #ifdef _WIN64
1117           int_args++;
1118           // Allocate slots for callee to stuff register args on the stack.
1119           stk_args += 2;
1120 #endif
1121         } else {
1122           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1123           stk_args += 2;
1124         }
1125         break;
1126       case T_DOUBLE:
1127         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1128         if (fp_args < Argument::n_float_register_parameters_c) {
1129           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1130 #ifdef _WIN64
1131           int_args++;
1132           // Allocate slots for callee to stuff register args on the stack.
1133           stk_args += 2;
1134 #endif
1135         } else {
1136           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1137           stk_args += 2;
1138         }
1139         break;
1140       case T_VOID: // Halves of longs and doubles
1141         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1142         regs[i].set_bad();
1143         break;
1144       default:
1145         ShouldNotReachHere();
1146         break;
1147       }
1148     }
1149 #ifdef _WIN64
1150   // The Windows ABI requires that we always allocate enough stack space
1151   // for 4 64-bit registers to be stored down.
1152   if (stk_args < 8) {
1153     stk_args = 8;
1154   }
1155 #endif // _WIN64
1156 
1157   return stk_args;
1158 }
1159 
1160 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1161                                              uint num_bits,
1162                                              uint total_args_passed) {
1163   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1164          "only certain vector sizes are supported for now");
1165 
1166   static const XMMRegister VEC_ArgReg[32] = {
1167      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1168      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1169     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1170     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1171   };
1172 
1173   uint stk_args = 0;
1174   uint fp_args = 0;
1175 
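       // Every argument gets the next XMM register as a pair covering the whole
       // vector; e.g. with num_bits == 256 argument i is placed in xmm<i> spanning
       // slots [vmreg, vmreg->next(7)]. No stack slots are used, so 0 is returned.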
1176   for (uint i = 0; i < total_args_passed; i++) {
1177     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1178     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1179     regs[i].set_pair(vmreg->next(next_val), vmreg);
1180   }
1181 
1182   return stk_args;
1183 }
1184 
1185 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1186   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1187   // which by this time is free to use.
1188   switch (ret_type) {
1189   case T_FLOAT:
1190     __ movflt(Address(rbp, -wordSize), xmm0);
1191     break;
1192   case T_DOUBLE:
1193     __ movdbl(Address(rbp, -wordSize), xmm0);
1194     break;
1195   case T_VOID:  break;
1196   default: {
1197     __ movptr(Address(rbp, -wordSize), rax);
1198     }
1199   }
1200 }
1201 
1202 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1203   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1204   // which by this time is free to use.
1205   switch (ret_type) {
1206   case T_FLOAT:
1207     __ movflt(xmm0, Address(rbp, -wordSize));
1208     break;
1209   case T_DOUBLE:
1210     __ movdbl(xmm0, Address(rbp, -wordSize));
1211     break;
1212   case T_VOID:  break;
1213   default: {
1214     __ movptr(rax, Address(rbp, -wordSize));
1215     }
1216   }
1217 }
1218 
1219 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1220     for ( int i = first_arg ; i < arg_count ; i++ ) {
1221       if (args[i].first()->is_Register()) {
1222         __ push(args[i].first()->as_Register());
1223       } else if (args[i].first()->is_XMMRegister()) {
1224         __ subptr(rsp, 2*wordSize);
1225         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1226       }
1227     }
1228 }
1229 
1230 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1231     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1232       if (args[i].first()->is_Register()) {
1233         __ pop(args[i].first()->as_Register());
1234       } else if (args[i].first()->is_XMMRegister()) {
1235         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1236         __ addptr(rsp, 2*wordSize);
1237       }
1238     }
1239 }
1240 
1241 // Unpack an array argument into a pointer to the body and the length
1242 // if the array is non-null, otherwise pass 0 for both.
1243 static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
1244   Register tmp_reg = rax;
1245   assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
1246          "possible collision");
1247   assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
1248          "possible collision");
1249 
1250   __ block_comment("unpack_array_argument {");
1251 
1252   // Pass the length, ptr pair
1253   Label is_null, done;
1254   VMRegPair tmp;
1255   tmp.set_ptr(tmp_reg->as_VMReg());
1256   if (reg.first()->is_stack()) {
1257     // Load the arg up from the stack
1258     __ move_ptr(reg, tmp);
1259     reg = tmp;
1260   }
1261   __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
1262   __ jccb(Assembler::equal, is_null);
1263   __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1264   __ move_ptr(tmp, body_arg);
1265   // load the length relative to the body.
1266   __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
1267                            arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1268   __ move32_64(tmp, length_arg);
1269   __ jmpb(done);
1270   __ bind(is_null);
1271   // Pass zeros
1272   __ xorptr(tmp_reg, tmp_reg);
1273   __ move_ptr(tmp, body_arg);
1274   __ move32_64(tmp, length_arg);
1275   __ bind(done);
1276 
1277   __ block_comment("} unpack_array_argument");
1278 }
1279 
1280 
1281 // Different signatures may require very different orders for the move
1282 // to avoid clobbering other arguments.  There's no simple way to
1283 // order them safely.  Compute a safe order for issuing stores and
1284 // break any cycles in those stores.  This code is fairly general but
1285 // it's not necessary on the other platforms so we keep it in the
1286 // platform dependent code instead of moving it into a shared file.
1287 // (See bugs 7013347 & 7145024.)
1288 // Note that this code is specific to LP64.
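     // For example, if one argument must move from rdi to rsi while another must
     // move from rsi to rdi, the cycle is broken by first saving one of them in
     // tmp_vmreg (see MoveOperation::break_cycle below).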
1289 class ComputeMoveOrder: public StackObj {
1290   class MoveOperation: public ResourceObj {
1291     friend class ComputeMoveOrder;
1292    private:
1293     VMRegPair        _src;
1294     VMRegPair        _dst;
1295     int              _src_index;
1296     int              _dst_index;
1297     bool             _processed;
1298     MoveOperation*  _next;
1299     MoveOperation*  _prev;
1300 
1301     static int get_id(VMRegPair r) {
1302       return r.first()->value();
1303     }
1304 
1305    public:
1306     MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1307       _src(src)
1308     , _dst(dst)
1309     , _src_index(src_index)
1310     , _dst_index(dst_index)
1311     , _processed(false)
1312     , _next(NULL)
1313     , _prev(NULL) {
1314     }
1315 
1316     VMRegPair src() const              { return _src; }
1317     int src_id() const                 { return get_id(src()); }
1318     int src_index() const              { return _src_index; }
1319     VMRegPair dst() const              { return _dst; }
1320     void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
1321     int dst_index() const              { return _dst_index; }
1322     int dst_id() const                 { return get_id(dst()); }
1323     MoveOperation* next() const       { return _next; }
1324     MoveOperation* prev() const       { return _prev; }
1325     void set_processed()               { _processed = true; }
1326     bool is_processed() const          { return _processed; }
1327 
1328     // insert
1329     void break_cycle(VMRegPair temp_register) {
1330       // create a new store following the last store
1331       // to move from the temp_register to the original
1332       MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1333 
1334       // break the cycle of links and insert new_store at the end
1335       // break the reverse link.
1336       MoveOperation* p = prev();
1337       assert(p->next() == this, "must be");
1338       _prev = NULL;
1339       p->_next = new_store;
1340       new_store->_prev = p;
1341 
1342       // change the original store to save its value in the temp.
1343       set_dst(-1, temp_register);
1344     }
1345 
1346     void link(GrowableArray<MoveOperation*>& killer) {
1347       // link this store in front of the store that it depends on
1348       MoveOperation* n = killer.at_grow(src_id(), NULL);
1349       if (n != NULL) {
1350         assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1351         _next = n;
1352         n->_prev = this;
1353       }
1354     }
1355   };
1356 
1357  private:
1358   GrowableArray<MoveOperation*> edges;
1359 
1360  public:
1361   ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1362                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1363     // Move operations where the dest is the stack can all be
1364     // scheduled first since they can't interfere with the other moves.
1365     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1366       if (in_sig_bt[i] == T_ARRAY) {
1367         c_arg--;
1368         if (out_regs[c_arg].first()->is_stack() &&
1369             out_regs[c_arg + 1].first()->is_stack()) {
1370           arg_order.push(i);
1371           arg_order.push(c_arg);
1372         } else {
1373           if (out_regs[c_arg].first()->is_stack() ||
1374               in_regs[i].first() == out_regs[c_arg].first()) {
1375             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1376           } else {
1377             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1378           }
1379         }
1380       } else if (in_sig_bt[i] == T_VOID) {
1381         arg_order.push(i);
1382         arg_order.push(c_arg);
1383       } else {
1384         if (out_regs[c_arg].first()->is_stack() ||
1385             in_regs[i].first() == out_regs[c_arg].first()) {
1386           arg_order.push(i);
1387           arg_order.push(c_arg);
1388         } else {
1389           add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1390         }
1391       }
1392     }
1393     // Break any cycles in the register moves and emit them in the
1394     // proper order.
1395     GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1396     for (int i = 0; i < stores->length(); i++) {
1397       arg_order.push(stores->at(i)->src_index());
1398       arg_order.push(stores->at(i)->dst_index());
1399     }
1400  }
1401 
1402   // Collect all the move operations
1403   void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1404     if (src.first() == dst.first()) return;
1405     edges.append(new MoveOperation(src_index, src, dst_index, dst));
1406   }
1407 
1408   // Walk the edges breaking cycles between moves.  The result list
1409   // can be walked in order to produce the proper set of loads
1410   GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1411     // Record which moves kill which values
1412     GrowableArray<MoveOperation*> killer;
1413     for (int i = 0; i < edges.length(); i++) {
1414       MoveOperation* s = edges.at(i);
1415       assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1416       killer.at_put_grow(s->dst_id(), s, NULL);
1417     }
1418     assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1419            "make sure temp isn't in the registers that are killed");
1420 
1421     // create links between loads and stores
1422     for (int i = 0; i < edges.length(); i++) {
1423       edges.at(i)->link(killer);
1424     }
1425 
1426     // at this point, all the move operations are chained together
1427     // in a doubly linked list.  Processing it backwards finds
1428     // the beginning of the chain, forwards finds the end.  If there's
1429     // a cycle it can be broken at any point,  so pick an edge and walk
1430     // backward until the list ends or we end where we started.
1431     GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1432     for (int e = 0; e < edges.length(); e++) {
1433       MoveOperation* s = edges.at(e);
1434       if (!s->is_processed()) {
1435         MoveOperation* start = s;
1436         // search for the beginning of the chain or cycle
1437         while (start->prev() != NULL && start->prev() != s) {
1438           start = start->prev();
1439         }
1440         if (start->prev() == s) {
1441           start->break_cycle(temp_register);
1442         }
1443         // walk the chain forward inserting to store list
1444         while (start != NULL) {
1445           stores->append(start);
1446           start->set_processed();
1447           start = start->next();
1448         }
1449       }
1450     }
1451     return stores;
1452   }
1453 };
1454 
1455 static void verify_oop_args(MacroAssembler* masm,
1456                             const methodHandle& method,
1457                             const BasicType* sig_bt,
1458                             const VMRegPair* regs) {
1459   Register temp_reg = rbx;  // not part of any compiled calling seq
1460   if (VerifyOops) {
1461     for (int i = 0; i < method->size_of_parameters(); i++) {
1462       if (is_reference_type(sig_bt[i])) {
1463         VMReg r = regs[i].first();
1464         assert(r->is_valid(), "bad oop arg");
1465         if (r->is_stack()) {
1466           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1467           __ verify_oop(temp_reg);
1468         } else {
1469           __ verify_oop(r->as_Register());
1470         }
1471       }
1472     }
1473   }
1474 }
1475 
1476 // defined in stubGenerator_x86_64.cpp
1477 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots);
1478 void fill_continuation_entry(MacroAssembler* masm);
1479 void continuation_enter_cleanup(MacroAssembler* masm);
1480 
1481 // enterSpecial(Continuation c, boolean isContinue)
1482 // On entry: c_rarg1 -- the continuation object
1483 //           c_rarg2 -- isContinue
1484 static void gen_continuation_enter(MacroAssembler* masm,
1485                                  const methodHandle& method,
1486                                  const BasicType* sig_bt,
1487                                  const VMRegPair* regs,
1488                                  int& exception_offset,
1489                                  OopMapSet* oop_maps,
1490                                  int& frame_complete,
1491                                  int& stack_slots) {
1492   //verify_oop_args(masm, method, sig_bt, regs);
1493   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1494                          relocInfo::static_call_type);
1495 
1496   stack_slots = 2; // will be overwritten
1497   address start = __ pc();
1498 
1499   Label call_thaw, exit;
1500 
1501   __ push(rbp);
1502 
1503   //BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1504   //bs->nmethod_entry_barrier(masm);
1505   OopMap* map = continuation_enter_setup(masm, stack_slots);  // kills rax
1506 
1507   // Frame is now completed as far as size and linkage.
1508   frame_complete = __ pc() - start;
1509   // if isContinue == 0
1510   //   _enterSP = sp
1511   // end
1512  
1513   fill_continuation_entry(masm); // kills rax
1514 
1515   __ cmpl(c_rarg2, 0);
1516   __ jcc(Assembler::notEqual, call_thaw);
1517 
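       // Pad with nops so that the 4-byte displacement of the following call
       // (it starts at pc + 1, just past the call opcode) is 4-byte aligned;
       // the resolved target is patched into that displacement later (see the
       // emit_to_interp_stub call below), presumably so the patch stays atomic.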
1518   int up = align_up((intptr_t) __ pc() + 1, 4) - (intptr_t) (__ pc() + 1);
1519   if (up > 0) {
1520     __ nop(up);
1521   }
1522 
1523   address mark = __ pc();
1524   __ call(resolve);
1525   oop_maps->add_gc_map(__ pc() - start, map);
1526   __ post_call_nop();
1527 
1528   __ jmp(exit);
1529 
1530   __ bind(call_thaw);
1531 
1532   __ movptr(rbx, (intptr_t) StubRoutines::cont_thaw());
1533   __ call(rbx);
1534   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1535   ContinuationEntry::return_pc_offset = __ pc() - start;
1536   __ post_call_nop();
1537 
1538   __ bind(exit);
1539   continuation_enter_cleanup(masm);
1540   __ pop(rbp);
1541   __ ret(0);
1542 
1543   /// exception handling
1544 
1545   exception_offset = __ pc() - start;
1546 
1547   continuation_enter_cleanup(masm);
1548   __ addptr(rsp, 1*wordSize);
1549 
1550   __ movptr(rbx, rax); // save the exception
1551   __ movptr(c_rarg0, Address(rsp, 0));
1552 
1553   __ call_VM_leaf(CAST_FROM_FN_PTR(address,
1554         SharedRuntime::exception_handler_for_return_address),
1555       r15_thread, c_rarg0);
1556   __ mov(rdi, rax);
1557   __ movptr(rax, rbx);
1558   __ mov(rbx, rdi);
1559   __ pop(rdx);
1560 
1561   // continue at exception handler (return address removed)
1562   // rax: exception
1563   // rbx: exception handler
1564   // rdx: throwing pc
1565   __ verify_oop(rax);
1566   __ jmp(rbx);
1567 
1568   CodeBuffer* cbuf = masm->code_section()->outer();
1569   address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, mark);
1570 }
1571 
1572 static void gen_special_dispatch(MacroAssembler* masm,
1573                                  const methodHandle& method,
1574                                  const BasicType* sig_bt,
1575                                  const VMRegPair* regs) {
1576   verify_oop_args(masm, method, sig_bt, regs);
1577   vmIntrinsics::ID iid = method->intrinsic_id();
1578 
1579   // Now write the args into the outgoing interpreter space
1580   bool     has_receiver   = false;
1581   Register receiver_reg   = noreg;
1582   int      member_arg_pos = -1;
1583   Register member_reg     = noreg;
1584   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1585   if (ref_kind != 0) {
1586     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1587     member_reg = rbx;  // known to be free at this point
1588     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1589   } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1590     has_receiver = true;
1591   } else {
1592     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1593   }
1594 
1595   if (member_reg != noreg) {
1596     // Load the member_arg into register, if necessary.
1597     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1598     VMReg r = regs[member_arg_pos].first();
1599     if (r->is_stack()) {
1600       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1601     } else {
1602       // no data motion is needed
1603       member_reg = r->as_Register();
1604     }
1605   }
1606 
1607   if (has_receiver) {
1608     // Make sure the receiver is loaded into a register.
1609     assert(method->size_of_parameters() > 0, "oob");
1610     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1611     VMReg r = regs[0].first();
1612     assert(r->is_valid(), "bad receiver arg");
1613     if (r->is_stack()) {
1614       // Porting note:  This assumes that compiled calling conventions always
1615       // pass the receiver oop in a register.  If this is not true on some
1616       // platform, pick a temp and load the receiver from stack.
1617       fatal("receiver always in a register");
1618       receiver_reg = j_rarg0;  // known to be free at this point
1619       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1620     } else {
1621       // no data motion is needed
1622       receiver_reg = r->as_Register();
1623     }
1624   }
1625 
1626   // Figure out which address we are really jumping to:
1627   MethodHandles::generate_method_handle_dispatch(masm, iid,
1628                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1629 }
1630 
1631 // ---------------------------------------------------------------------------
1632 // Generate a native wrapper for a given method.  The method takes arguments
1633 // in the Java compiled code convention, marshals them to the native
1634 // convention (handlizes oops, etc), transitions to native, makes the call,
1635 // returns to java state (possibly blocking), unhandlizes any result and
1636 // returns.
1637 //
1638 // Critical native functions are a shorthand for the use of
1639 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1640 // functions.  The wrapper is expected to unpack the arguments before
1641 // passing them to the callee. Critical native functions leave the state _in_Java,
1642 // since they cannot stop for GC.
1643 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1644 // block and the check for pending exceptions, since it's impossible for them
1645 // to be thrown.
1646 //
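     // For a critical native, each T_ARRAY parameter is expanded below into a
     // (jint length, elem* body) pair, which is why total_c_args grows by one
     // for every array argument.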
1647 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1648                                                 const methodHandle& method,
1649                                                 int compile_id,
1650                                                 BasicType* in_sig_bt,
1651                                                 VMRegPair* in_regs,
1652                                                 BasicType ret_type,
1653                                                 address critical_entry) {
1654   if (method->is_continuation_enter_intrinsic()) {
1655     vmIntrinsics::ID iid = method->intrinsic_id();
1656     intptr_t start = (intptr_t)__ pc();
1657     int vep_offset = ((intptr_t)__ pc()) - start;
1658     int exception_offset = 0;
1659     int frame_complete = 0;
1660     int stack_slots = 0;
1661     OopMapSet* oop_maps =  new OopMapSet();
1662     gen_continuation_enter(masm,
1663                          method,
1664                          in_sig_bt,
1665                          in_regs,
1666                          exception_offset,
1667                          oop_maps,
1668                          frame_complete,
1669                          stack_slots);
1670     __ flush();
1671     nmethod* nm = nmethod::new_native_nmethod(method,
1672                                               compile_id,
1673                                               masm->code(),
1674                                               vep_offset,
1675                                               frame_complete,
1676                                               stack_slots,
1677                                               in_ByteSize(-1),
1678                                               in_ByteSize(-1),
1679                                               oop_maps,
1680                                               exception_offset);
1681     ContinuationEntry::set_enter_nmethod(nm);
1682     return nm;
1683   }
1684 
1685   if (method->is_method_handle_intrinsic()) {
1686     vmIntrinsics::ID iid = method->intrinsic_id();
1687     intptr_t start = (intptr_t)__ pc();
1688     int vep_offset = ((intptr_t)__ pc()) - start;
1689     gen_special_dispatch(masm,
1690                          method,
1691                          in_sig_bt,
1692                          in_regs);
1693     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1694     __ flush();
1695     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1696     return nmethod::new_native_nmethod(method,
1697                                        compile_id,
1698                                        masm->code(),
1699                                        vep_offset,
1700                                        frame_complete,
1701                                        stack_slots / VMRegImpl::slots_per_word,
1702                                        in_ByteSize(-1),
1703                                        in_ByteSize(-1),
1704                                        (OopMapSet*)NULL);
1705   }
1706 
1707   bool is_critical_native = true;
1708   address native_func = critical_entry;
1709   if (native_func == NULL) {
1710     native_func = method->native_function();
1711     is_critical_native = false;
1712   }
1713   assert(native_func != NULL, "must have function");
1714 
1715   // An OopMap for lock (and class if static)
1716   OopMapSet *oop_maps = new OopMapSet();
1717   intptr_t start = (intptr_t)__ pc();
1718 
1719   // We have received a description of where all the java args are located
1720   // on entry to the wrapper. We need to convert these args to where
1721   // the jni function will expect them. To figure out where they go
1722   // we convert the java signature to a C signature by inserting
1723   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1724 
1725   const int total_in_args = method->size_of_parameters();
1726   int total_c_args = total_in_args;
1727   if (!is_critical_native) {
1728     total_c_args += 1;
1729     if (method->is_static()) {
1730       total_c_args++;
1731     }
1732   } else {
1733     for (int i = 0; i < total_in_args; i++) {
1734       if (in_sig_bt[i] == T_ARRAY) {
1735         total_c_args++;
1736       }
1737     }
1738   }
1739 
1740   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1741   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1742   BasicType* in_elem_bt = NULL;
1743 
1744   int argc = 0;
1745   if (!is_critical_native) {
1746     out_sig_bt[argc++] = T_ADDRESS;
1747     if (method->is_static()) {
1748       out_sig_bt[argc++] = T_OBJECT;
1749     }
1750 
1751     for (int i = 0; i < total_in_args ; i++ ) {
1752       out_sig_bt[argc++] = in_sig_bt[i];
1753     }
1754   } else {
1755     in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1756     SignatureStream ss(method->signature());
1757     for (int i = 0; i < total_in_args ; i++ ) {
1758       if (in_sig_bt[i] == T_ARRAY) {
1759         // Arrays are passed as int, elem* pair
1760         out_sig_bt[argc++] = T_INT;
1761         out_sig_bt[argc++] = T_ADDRESS;
1762         ss.skip_array_prefix(1);  // skip one '['
1763         assert(ss.is_primitive(), "primitive type expected");
1764         in_elem_bt[i] = ss.type();
1765       } else {
1766         out_sig_bt[argc++] = in_sig_bt[i];
1767         in_elem_bt[i] = T_VOID;
1768       }
1769       if (in_sig_bt[i] != T_VOID) {
1770         assert(in_sig_bt[i] == ss.type() ||
1771                in_sig_bt[i] == T_ARRAY, "must match");
1772         ss.next();
1773       }
1774     }
1775   }
1776 
1777   // Now figure out where the args must be stored and how much stack space
1778   // they require.
1779   int out_arg_slots;
1780   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1781 
1782   // Compute framesize for the wrapper.  We need to handlize all oops in
1783   // incoming registers
1784 
1785   // Calculate the total number of stack slots we will need.
1786 
1787   // First count the abi requirement plus all of the outgoing args
1788   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1789 
1790   // Now the space for the inbound oop handle area
1791   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1792   if (is_critical_native) {
1793     // Critical natives may have to call out so they need a save area
1794     // for register arguments.
1795     int double_slots = 0;
1796     int single_slots = 0;
1797     for ( int i = 0; i < total_in_args; i++) {
1798       if (in_regs[i].first()->is_Register()) {
1799         const Register reg = in_regs[i].first()->as_Register();
1800         switch (in_sig_bt[i]) {
1801           case T_BOOLEAN:
1802           case T_BYTE:
1803           case T_SHORT:
1804           case T_CHAR:
1805           case T_INT:  single_slots++; break;
1806           case T_ARRAY:  // specific to LP64 (7145024)
1807           case T_LONG: double_slots++; break;
1808           default:  ShouldNotReachHere();
1809         }
1810       } else if (in_regs[i].first()->is_XMMRegister()) {
1811         switch (in_sig_bt[i]) {
1812           case T_FLOAT:  single_slots++; break;
1813           case T_DOUBLE: double_slots++; break;
1814           default:  ShouldNotReachHere();
1815         }
1816       } else if (in_regs[i].first()->is_FloatRegister()) {
1817         ShouldNotReachHere();
1818       }
1819     }
1820     total_save_slots = double_slots * 2 + single_slots;
1821     // align the save area
1822     if (double_slots != 0) {
1823       stack_slots = align_up(stack_slots, 2);
1824     }
1825   }
1826 
1827   int oop_handle_offset = stack_slots;
1828   stack_slots += total_save_slots;
1829 
1830   // Now any space we need for handlizing a klass if static method
1831 
1832   int klass_slot_offset = 0;
1833   int klass_offset = -1;
1834   int lock_slot_offset = 0;
1835   bool is_static = false;
1836 
1837   if (method->is_static()) {
1838     klass_slot_offset = stack_slots;
1839     stack_slots += VMRegImpl::slots_per_word;
1840     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1841     is_static = true;
1842   }
1843 
1844   // Plus a lock if needed
1845 
1846   if (method->is_synchronized()) {
1847     lock_slot_offset = stack_slots;
1848     stack_slots += VMRegImpl::slots_per_word;
1849   }
1850 
1851   // Now a place (+2) to save return values or temp during shuffling
1852   // + 4 for return address (which we own) and saved rbp
1853   stack_slots += 6;
1854 
1855   // Ok The space we have allocated will look like:
1856   //
1857   //
1858   // FP-> |                     |
1859   //      |---------------------|
1860   //      | 2 slots for moves   |
1861   //      |---------------------|
1862   //      | lock box (if sync)  |
1863   //      |---------------------| <- lock_slot_offset
1864   //      | klass (if static)   |
1865   //      |---------------------| <- klass_slot_offset
1866   //      | oopHandle area      |
1867   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1868   //      | outbound memory     |
1869   //      | based arguments     |
1870   //      |                     |
1871   //      |---------------------|
1872   //      |                     |
1873   // SP-> | out_preserved_slots |
1874   //
1875   //
1876 
1877 
1878   // Now compute actual number of stack words we need rounding to make
1879   // stack properly aligned.
1880   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1881 
1882   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1883 
1884   // First thing make an ic check to see if we should even be here
1885 
1886   // We are free to use all registers as temps without saving them and
1887   // restoring them except rbp. rbp is the only callee save register
1888   // as far as the interpreter and the compiler(s) are concerned.
1889 
1890 
1891   const Register ic_reg = rax;
1892   const Register receiver = j_rarg0;
1893 
1894   Label hit;
1895   Label exception_pending;
1896 
1897   assert_different_registers(ic_reg, receiver, rscratch1);
1898   __ verify_oop(receiver);
1899   __ load_klass(rscratch1, receiver, rscratch2);
1900   __ cmpq(ic_reg, rscratch1);
1901   __ jcc(Assembler::equal, hit);
1902 
1903   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1904 
1905   // Verified entry point must be aligned
1906   __ align(8);
1907 
1908   __ bind(hit);
1909 
1910   int vep_offset = ((intptr_t)__ pc()) - start;
1911 
1912   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1913     Label L_skip_barrier;
1914     Register klass = r10;
1915     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1916     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1917 
1918     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1919 
1920     __ bind(L_skip_barrier);
1921   }
1922 
1923 #ifdef COMPILER1
1924   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1925   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1926     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1927   }
1928 #endif // COMPILER1
1929 
1930   // The instruction at the verified entry point must be 5 bytes or longer
1931   // because it can be patched on the fly by make_non_entrant. The stack bang
1932   // instruction fits that requirement.
1933 
1934   // Generate stack overflow check
1935   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1936 
1937   // Generate a new frame for the wrapper.
1938   __ enter();
1939   // -2 because return address is already present and so is saved rbp
1940   __ subptr(rsp, stack_size - 2*wordSize);
1941 
1942   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1943   bs->nmethod_entry_barrier(masm);
1944 
1945   // Frame is now completed as far as size and linkage.
1946   int frame_complete = ((intptr_t)__ pc()) - start;
1947 
1948   if (UseRTMLocking) {
1949     // Abort RTM transaction before calling JNI
1950     // because critical section will be large and will be
1951     // aborted anyway. Also nmethod could be deoptimized.
1952     __ xabort(0);
1953   }
1954 
1955 #ifdef ASSERT
1956   {
1957     Label L;
1958     __ mov(rax, rsp);
1959     __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1960     __ cmpptr(rax, rsp);
1961     __ jcc(Assembler::equal, L);
1962     __ stop("improperly aligned stack");
1963     __ bind(L);
1964   }
1965 #endif /* ASSERT */
1966 
1967 
1968   // We use r14 as the oop handle for the receiver/klass
1969   // It is callee save so it survives the call to native
1970 
1971   const Register oop_handle_reg = r14;
1972 
1973   //
1974   // We immediately shuffle the arguments so that any vm call we have to
1975   // make from here on out (sync slow path, jvmti, etc.) we will have
1976   // captured the oops from our caller and have a valid oopMap for
1977   // them.
1978 
1979   // -----------------
1980   // The Grand Shuffle
1981 
1982   // The Java calling convention is either equal (linux) or denser (win64) than the
1983   // c calling convention. However, because of the jni_env argument, the c calling
1984   // convention always has at least one more (and two for static) arguments than Java.
1985   // Therefore if we move the args from java -> c backwards then we will never have
1986   // a register->register conflict and we don't have to build a dependency graph
1987   // and figure out how to break any cycles.
1988   //
1989 
1990   // Record esp-based slot for receiver on stack for non-static methods
1991   int receiver_offset = -1;
1992 
1993   // This is a trick. We double the stack slots so we can claim
1994   // the oops in the caller's frame. Since we are sure to have
1995   // more args than the caller, doubling is enough to make
1996   // sure we can capture all the incoming oop args from the
1997   // caller.
1998   //
1999   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2000 
2001   // Mark location of rbp (someday)
2002   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2003 
2004   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2005   // All inbound args are referenced based on rbp and all outbound args via rsp.
2006 
2007 
2008 #ifdef ASSERT
2009   bool reg_destroyed[RegisterImpl::number_of_registers];
2010   bool freg_destroyed[XMMRegisterImpl::number_of_registers];
2011   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
2012     reg_destroyed[r] = false;
2013   }
2014   for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
2015     freg_destroyed[f] = false;
2016   }
2017 
2018 #endif /* ASSERT */
2019 
2020   // This may iterate in two different directions depending on the
2021   // kind of native it is.  The reason is that for regular JNI natives
2022   // the incoming and outgoing registers are offset upwards and for
2023   // critical natives they are offset down.
2024   GrowableArray<int> arg_order(2 * total_in_args);
2025 
2026   VMRegPair tmp_vmreg;
2027   tmp_vmreg.set2(rbx->as_VMReg());
2028 
2029   if (!is_critical_native) {
2030     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2031       arg_order.push(i);
2032       arg_order.push(c_arg);
2033     }
2034   } else {
2035     // Compute a valid move order, using tmp_vmreg to break any cycles
2036     ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
2037   }
2038 
2039   int temploc = -1;
2040   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2041     int i = arg_order.at(ai);
2042     int c_arg = arg_order.at(ai + 1);
2043     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2044     if (c_arg == -1) {
2045       assert(is_critical_native, "should only be required for critical natives");
2046       // This arg needs to be moved to a temporary
2047       __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
2048       in_regs[i] = tmp_vmreg;
2049       temploc = i;
2050       continue;
2051     } else if (i == -1) {
2052       assert(is_critical_native, "should only be required for critical natives");
2053       // Read from the temporary location
2054       assert(temploc != -1, "must be valid");
2055       i = temploc;
2056       temploc = -1;
2057     }
2058 #ifdef ASSERT
2059     if (in_regs[i].first()->is_Register()) {
2060       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2061     } else if (in_regs[i].first()->is_XMMRegister()) {
2062       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2063     }
2064     if (out_regs[c_arg].first()->is_Register()) {
2065       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2066     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2067       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2068     }
2069 #endif /* ASSERT */
2070     switch (in_sig_bt[i]) {
2071       case T_ARRAY:
2072         if (is_critical_native) {
2073           unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
2074           c_arg++;
2075 #ifdef ASSERT
2076           if (out_regs[c_arg].first()->is_Register()) {
2077             reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2078           } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2079             freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2080           }
2081 #endif
2082           break;
2083         }
2084       case T_OBJECT:
2085         assert(!is_critical_native, "no oop arguments");
2086         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2087                     ((i == 0) && (!is_static)),
2088                     &receiver_offset);
2089         break;
2090       case T_VOID:
2091         break;
2092 
2093       case T_FLOAT:
2094         __ float_move(in_regs[i], out_regs[c_arg]);
2095           break;
2096 
2097       case T_DOUBLE:
2098         assert( i + 1 < total_in_args &&
2099                 in_sig_bt[i + 1] == T_VOID &&
2100                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2101         __ double_move(in_regs[i], out_regs[c_arg]);
2102         break;
2103 
2104       case T_LONG :
2105         __ long_move(in_regs[i], out_regs[c_arg]);
2106         break;
2107 
2108       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2109 
2110       default:
2111         __ move32_64(in_regs[i], out_regs[c_arg]);
2112     }
2113   }
2114 
2115   int c_arg;
2116 
2117   // Pre-load a static method's oop into r14.  Used both by locking code and
2118   // the normal JNI call code.
2119   if (!is_critical_native) {
2120     // point c_arg at the first arg that is already loaded in case we
2121     // need to spill before we call out
2122     c_arg = total_c_args - total_in_args;
2123 
2124     if (method->is_static()) {
2125 
2126       //  load oop into a register
2127       __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2128 
2129       // Now handlize the static class mirror; it's known not-null.
2130       __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2131       map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2132 
2133       // Now get the handle
2134       __ lea(oop_handle_reg, Address(rsp, klass_offset));
2135       // store the klass handle as second argument
2136       __ movptr(c_rarg1, oop_handle_reg);
2137       // and protect the arg if we must spill
2138       c_arg--;
2139     }
2140   } else {
2141     // For JNI critical methods we need to save all registers in save_args.
2142     c_arg = 0;
2143   }
2144 
2145   // Change state to native (we save the return address in the thread, since it might not
2146   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2147   // points into the right code segment. It does not have to be the correct return pc.
2148   // We use the same pc/oopMap repeatedly when we call out
2149 
2150   intptr_t the_pc = (intptr_t) __ pc();
2151   oop_maps->add_gc_map(the_pc - start, map);
2152 
2153   __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2154 
2155 
2156   // We have all of the arguments set up at this point. We must not touch any of the
2157   // argument registers at this point (what if we save/restore them? there are no oops).
2158 
2159   {
2160     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2161     // protect the args we've loaded
2162     save_args(masm, total_c_args, c_arg, out_regs);
2163     __ mov_metadata(c_rarg1, method());
2164     __ call_VM_leaf(
2165       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2166       r15_thread, c_rarg1);
2167     restore_args(masm, total_c_args, c_arg, out_regs);
2168   }
2169 
2170   // RedefineClasses() tracing support for obsolete method entry
2171   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2172     // protect the args we've loaded
2173     save_args(masm, total_c_args, c_arg, out_regs);
2174     __ mov_metadata(c_rarg1, method());
2175     __ call_VM_leaf(
2176       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2177       r15_thread, c_rarg1);
2178     restore_args(masm, total_c_args, c_arg, out_regs);
2179   }
2180 
2181   // Lock a synchronized method
2182 
2183   // Register definitions used by locking and unlocking
2184 
2185   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2186   const Register obj_reg  = rbx;  // Will contain the oop
2187   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2188   const Register old_hdr  = r13;  // value of old header at unlock time
2189 
2190   Label slow_path_lock;
2191   Label lock_done;
2192 
2193   if (method->is_synchronized()) {
2194     assert(!is_critical_native, "unhandled");
2195 
2196 
2197     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2198 
2199     // Get the handle (the 2nd argument)
2200     __ mov(oop_handle_reg, c_rarg1);
2201 
2202     // Get address of the box
2203 
2204     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2205 
2206     // Load the oop from the handle
2207     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2208 
2209     // Load immediate 1 into swap_reg %rax
2210     __ movl(swap_reg, 1);
2211 
2212     // Load (object->mark() | 1) into swap_reg %rax
2213     __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2214 
2215     // Save (object->mark() | 1) into BasicLock's displaced header
2216     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2217 
2218     // src -> dest iff dest == rax else rax <- dest
2219     __ lock();
2220     __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2221     __ jcc(Assembler::equal, lock_done);
2222 
2223     // Hmm should this move to the slow path code area???
2224 
2225     // Test if the oopMark is an obvious stack pointer, i.e.,
2226     //  1) (mark & 3) == 0, and
2227     //  2) rsp <= mark < mark + os::pagesize()
2228     // These 3 tests can be done by evaluating the following
2229     // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2230     // assuming both stack pointer and pagesize have their
2231     // least significant 2 bits clear.
2232     // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
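         // For example, with a 4K page, 3 - os::vm_page_size() == 0x...fffff003,
         // so the AND is zero exactly when 0 <= mark - rsp < 4096 and the low two
         // bits of the difference are clear, i.e. the displaced mark is a stack
         // address at or just above rsp: the recursive locking case.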
2233 
2234     __ subptr(swap_reg, rsp);
2235     __ andptr(swap_reg, 3 - os::vm_page_size());
2236 
2237     // Save the test result; for the recursive case, the result is zero
2238     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2239     __ jcc(Assembler::notEqual, slow_path_lock);
2240 
2241     // Slow path will re-enter here
2242     __ bind(lock_done);
2243     // __ inc_held_monitor_count(r15_thread);
2244   }
2245 
2246   // Finally just about ready to make the JNI call
2247 
2248   // get JNIEnv* which is first argument to native
2249   if (!is_critical_native) {
2250     __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2251 
2252     // Now set thread in native
2253     __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2254   }
2255 
2256   __ call(RuntimeAddress(native_func));
2257 
2258   // Verify or restore cpu control state after JNI call
2259   __ restore_cpu_control_state_after_jni();
2260 
2261   // Unpack native results.
2262   switch (ret_type) {
2263   case T_BOOLEAN: __ c2bool(rax);            break;
2264   case T_CHAR   : __ movzwl(rax, rax);      break;
2265   case T_BYTE   : __ sign_extend_byte (rax); break;
2266   case T_SHORT  : __ sign_extend_short(rax); break;
2267   case T_INT    : /* nothing to do */        break;
2268   case T_DOUBLE :
2269   case T_FLOAT  :
2270     // Result is in xmm0 we'll save as needed
2271     break;
2272   case T_ARRAY:                 // Really a handle
2273   case T_OBJECT:                // Really a handle
2274       break; // can't de-handlize until after safepoint check
2275   case T_VOID: break;
2276   case T_LONG: break;
2277   default       : ShouldNotReachHere();
2278   }
2279 
2280   Label after_transition;
2281 
2282   // If this is a critical native, check for a safepoint or suspend request after the call.
2283   // If a safepoint is needed, transition to native, then to native_trans to handle
2284   // safepoints like the native methods that are not critical natives.
2285   if (is_critical_native) {
2286     Label needs_safepoint;
2287     __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */);
2288     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2289     __ jcc(Assembler::equal, after_transition);
2290     __ bind(needs_safepoint);
2291   }
2292 
2293   // Switch thread to "native transition" state before reading the synchronization state.
2294   // This additional state is necessary because reading and testing the synchronization
2295   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2296   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2297   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2298   //     Thread A is resumed to finish this native method, but doesn't block here since it
2299   //     didn't see any synchronization in progress, and escapes.
2300   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2301 
2302   // Force this write out before the read below
2303   __ membar(Assembler::Membar_mask_bits(
2304               Assembler::LoadLoad | Assembler::LoadStore |
2305               Assembler::StoreLoad | Assembler::StoreStore));
2306 
2307   // check for safepoint operation in progress and/or pending suspend requests
2308   {
2309     Label Continue;
2310     Label slow_path;
2311 
2312     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2313 
2314     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2315     __ jcc(Assembler::equal, Continue);
2316     __ bind(slow_path);
2317 
2318     // Don't use call_VM as it will see a possible pending exception and forward it
2319     // and never return here preventing us from clearing _last_native_pc down below.
2320     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2321     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2322     // by hand.
2323     //
2324     __ vzeroupper();
2325     save_native_result(masm, ret_type, stack_slots);
2326     __ mov(c_rarg0, r15_thread);
2327     __ mov(r12, rsp); // remember sp
2328     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2329     __ andptr(rsp, -16); // align stack as required by ABI
2330     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2331     __ mov(rsp, r12); // restore sp
2332     __ reinit_heapbase();
2333     // Restore any method result value
2334     restore_native_result(masm, ret_type, stack_slots);
2335     __ bind(Continue);
2336   }
2337 
2338   // change thread state
2339   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2340   __ bind(after_transition);
2341 
2342   Label reguard;
2343   Label reguard_done;
2344   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2345   __ jcc(Assembler::equal, reguard);
2346   __ bind(reguard_done);
2347 
2348   // native result if any is live
2349 
2350   // Unlock
2351   Label unlock_done;
2352   Label slow_path_unlock;
2353   if (method->is_synchronized()) {
2354 
2355     // Get locked oop from the handle we passed to jni
2356     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2357 
2358     Label done;
2359     // Simple recursive lock?
2360 
2361     __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2362     __ jcc(Assembler::equal, done);
2363 
2364     // Must save rax if it is live now because cmpxchg must use it
2365     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2366       save_native_result(masm, ret_type, stack_slots);
2367     }
2368 
2369 
2370     // get address of the stack lock
2371     __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2372     //  get old displaced header
2373     __ movptr(old_hdr, Address(rax, 0));
2374 
2375     // Atomic swap old header if oop still contains the stack lock
2376     __ lock();
2377     __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2378     __ jcc(Assembler::notEqual, slow_path_unlock);
2379 
2380     // slow path re-enters here
2381     __ bind(unlock_done);
2382     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2383       restore_native_result(masm, ret_type, stack_slots);
2384     }
2385 
2386     __ bind(done);
2387     // __ dec_held_monitor_count(r15_thread);
2388   }
2389   {
2390     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2391     save_native_result(masm, ret_type, stack_slots);
2392     __ mov_metadata(c_rarg1, method());
2393     __ call_VM_leaf(
2394          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2395          r15_thread, c_rarg1);
2396     restore_native_result(masm, ret_type, stack_slots);
2397   }
2398 
2399   __ reset_last_Java_frame(false);
2400 
2401   // Unbox oop result, e.g. JNIHandles::resolve value.
2402   if (is_reference_type(ret_type)) {
2403     __ resolve_jobject(rax /* value */,
2404                        r15_thread /* thread */,
2405                        rcx /* tmp */);
2406   }
2407 
2408   if (CheckJNICalls) {
2409     // clear_pending_jni_exception_check
2410     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2411   }
2412 
2413   if (!is_critical_native) {
2414     // reset handle block
2415     __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2416     __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2417   }
2418 
2419   // pop our frame
2420 
2421   __ leave();
2422 
2423   if (!is_critical_native) {
2424     // Any exception pending?
2425     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2426     __ jcc(Assembler::notEqual, exception_pending);
2427   }
2428 
2429   // Return
2430 
2431   __ ret(0);
2432 
2433   // Unexpected paths are out of line and go here
2434 
2435   if (!is_critical_native) {
2436     // forward the exception
2437     __ bind(exception_pending);
2438 
2439     // and forward the exception
2440     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2441   }
2442 
2443   // Slow path locking & unlocking
2444   if (method->is_synchronized()) {
2445 
2446     // BEGIN Slow path lock
2447     __ bind(slow_path_lock);
2448 
2449     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2450     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2451 
2452     // protect the args we've loaded
2453     save_args(masm, total_c_args, c_arg, out_regs);
2454 
2455     __ mov(c_rarg0, obj_reg);
2456     __ mov(c_rarg1, lock_reg);
2457     __ mov(c_rarg2, r15_thread);
2458 
2459     // Not a leaf but we have last_Java_frame setup as we want
2460     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2461     restore_args(masm, total_c_args, c_arg, out_regs);
2462 
2463 #ifdef ASSERT
2464     { Label L;
2465     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2466     __ jcc(Assembler::equal, L);
2467     __ stop("no pending exception allowed on exit from monitorenter");
2468     __ bind(L);
2469     }
2470 #endif
2471     __ jmp(lock_done);
2472 
2473     // END Slow path lock
2474 
2475     // BEGIN Slow path unlock
2476     __ bind(slow_path_unlock);
2477 
2478     // If we haven't already saved the native result we must save it now as xmm registers
2479     // are still exposed.
2480     __ vzeroupper();
2481     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2482       save_native_result(masm, ret_type, stack_slots);
2483     }
2484 
2485     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2486 
2487     __ mov(c_rarg0, obj_reg);
2488     __ mov(c_rarg2, r15_thread);
2489     __ mov(r12, rsp); // remember sp
2490     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2491     __ andptr(rsp, -16); // align stack as required by ABI
2492 
2493     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2494     // NOTE that obj_reg == rbx currently
2495     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2496     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2497 
2498     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2499     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2500     __ mov(rsp, r12); // restore sp
2501     __ reinit_heapbase();
2502 #ifdef ASSERT
2503     {
2504       Label L;
2505       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2506       __ jcc(Assembler::equal, L);
2507       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2508       __ bind(L);
2509     }
2510 #endif /* ASSERT */
2511 
2512     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2513 
2514     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2515       restore_native_result(masm, ret_type, stack_slots);
2516     }
2517     __ jmp(unlock_done);
2518 
2519     // END Slow path unlock
2520 
2521   } // synchronized
2522 
2523   // SLOW PATH Reguard the stack if needed
2524 
2525   __ bind(reguard);
2526   __ vzeroupper();
2527   save_native_result(masm, ret_type, stack_slots);
2528   __ mov(r12, rsp); // remember sp
2529   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2530   __ andptr(rsp, -16); // align stack as required by ABI
2531   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2532   __ mov(rsp, r12); // restore sp
2533   __ reinit_heapbase();
2534   restore_native_result(masm, ret_type, stack_slots);
2535   // and continue
2536   __ jmp(reguard_done);
2537 
2538 
2539 
2540   __ flush();
2541 
2542   nmethod *nm = nmethod::new_native_nmethod(method,
2543                                             compile_id,
2544                                             masm->code(),
2545                                             vep_offset,
2546                                             frame_complete,
2547                                             stack_slots / VMRegImpl::slots_per_word,
2548                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2549                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2550                                             oop_maps);
2551 
2552   return nm;
2553 }
2554 
2555 // this function returns the adjustment (in number of words) to a c2i adapter
2556 // activation for use during deoptimization
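     // e.g. a callee with 2 parameters and 5 locals adjusts the frame by
     // (5 - 2) * Interpreter::stackElementWords words.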
2557 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2558   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2559 }
2560 
2561 
2562 uint SharedRuntime::out_preserve_stack_slots() {
2563   return 0;
2564 }
2565 
2566 
2567 // Number of stack slots between incoming argument block and the start of
2568 // a new frame.  The PROLOG must add this many slots to the stack.  The
2569 // EPILOG must remove this many slots.  amd64 needs two slots for
2570 // return address.
2571 uint SharedRuntime::in_preserve_stack_slots() {
2572   return 4 + 2 * VerifyStackAtCalls;
2573 }
2574 
2575 //------------------------------generate_deopt_blob----------------------------
2576 void SharedRuntime::generate_deopt_blob() {
2577   // Allocate space for the code
2578   ResourceMark rm;
2579   // Setup code generation tools
2580   int pad = 0;
2581   if (UseAVX > 2) {
2582     pad += 1024;
2583   }
2584 #if INCLUDE_JVMCI
2585   if (EnableJVMCI) {
2586     pad += 512; // Increase the buffer size when compiling for JVMCI
2587   }
2588 #endif
2589   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2590   MacroAssembler* masm = new MacroAssembler(&buffer);
2591   int frame_size_in_words;
2592   OopMap* map = NULL;
2593   OopMapSet *oop_maps = new OopMapSet();
2594 
2595   // -------------
2596   // This code enters when returning to a de-optimized nmethod.  A return
2597   // address has been pushed on the stack, and return values are in
2598   // registers.
2599   // If we are doing a normal deopt then we were called from the patched
2600   // nmethod from the point we returned to the nmethod. So the return
2601   // address on the stack is wrong by NativeCall::instruction_size
2602   // We will adjust the value so it looks like we have the original return
2603   // address on the stack (like when we eagerly deoptimized).
2604   // In the case of an exception pending when deoptimizing, we enter
2605   // with a return address on the stack that points after the call we patched
2606   // into the exception handler. We have the following register state from,
2607   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2608   //    rax: exception oop
2609   //    rbx: exception handler
2610   //    rdx: throwing pc
2611   // So in this case we simply jam rdx into the useless return address and
2612   // the stack looks just like we want.
2613   //
2614   // At this point we need to de-opt.  We save the argument return
2615   // registers.  We call the first C routine, fetch_unroll_info().  This
2616   // routine captures the return values and returns a structure which
2617   // describes the current frame size and the sizes of all replacement frames.
2618   // The current frame is compiled code and may contain many inlined
2619   // functions, each with their own JVM state.  We pop the current frame, then
2620   // push all the new frames.  Then we call the C routine unpack_frames() to
2621   // populate these frames.  Finally unpack_frames() returns us the new target
2622   // address.  Notice that callee-save registers are BLOWN here; they have
2623   // already been captured in the vframeArray at the time the return PC was
2624   // patched.
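       // The chosen exec mode (Unpack_deopt, Unpack_reexecute or Unpack_exception)
       // is kept in r14, which is callee-saved across the runtime calls below.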
2625   address start = __ pc();
2626   Label cont;
2627 
2628   // Prolog for non exception case!
2629 
2630   // Save everything in sight.
2631   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2632 
2633   // Normal deoptimization.  Save exec mode for unpack_frames.
2634   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2635   __ jmp(cont);
2636 
2637   int reexecute_offset = __ pc() - start;
2638 #if INCLUDE_JVMCI && !defined(COMPILER1)
2639   if (EnableJVMCI && UseJVMCICompiler) {
2640     // JVMCI does not use this kind of deoptimization
2641     __ should_not_reach_here();
2642   }
2643 #endif
2644 
2645   // Reexecute case
2646   // return address is the pc that describes what bci to re-execute at
2647 
2648   // No need to update map as each call to save_live_registers will produce identical oopmap
2649   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2650 
2651   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2652   __ jmp(cont);
2653 
2654 #if INCLUDE_JVMCI
2655   Label after_fetch_unroll_info_call;
2656   int implicit_exception_uncommon_trap_offset = 0;
2657   int uncommon_trap_offset = 0;
2658 
2659   if (EnableJVMCI) {
2660     implicit_exception_uncommon_trap_offset = __ pc() - start;
2661 
2662     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2663     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2664 
2665     uncommon_trap_offset = __ pc() - start;
2666 
2667     // Save everything in sight.
2668     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2669     // fetch_unroll_info needs to call last_java_frame()
2670     __ set_last_Java_frame(noreg, noreg, NULL);
2671 
2672     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2673     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2674 
2675     __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2676     __ mov(c_rarg0, r15_thread);
2677     __ movl(c_rarg2, r14); // exec mode
2678     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2679     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2680 
2681     __ reset_last_Java_frame(false);
2682 
2683     __ jmp(after_fetch_unroll_info_call);
2684   } // EnableJVMCI
2685 #endif // INCLUDE_JVMCI
2686 
2687   int exception_offset = __ pc() - start;
2688 
2689   // Prolog for exception case
2690 
2691   // All registers are dead at this entry point, except for rax and
2692   // rdx, which contain the exception oop and exception pc,
2693   // respectively.  Set them in TLS and fall through to the
2694   // unpack_with_exception_in_tls entry point.
2695 
2696   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2697   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2698 
2699   int exception_in_tls_offset = __ pc() - start;
2700 
2701   // new implementation because exception oop is now passed in JavaThread
2702 
2703   // Prolog for exception case
2704   // All registers must be preserved because they might be used by LinearScan
2705   // Exception oop and throwing PC are passed in JavaThread
2706   // tos: stack at point of call to method that threw the exception (i.e. only
2707   // args are on the stack, no return address)
2708 
2709   // make room on stack for the return address
2710   // It will be patched later with the throwing pc. The correct value is not
2711   // available now because loading it from memory would destroy registers.
2712   __ push(0);
2713 
2714   // Save everything in sight.
2715   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2716 
2717   // Now it is safe to overwrite any register
2718 
2719   // Deopt during an exception.  Save exec mode for unpack_frames.
2720   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2721 
2722   // load throwing pc from JavaThread and patch it as the return address
2723   // of the current frame. Then clear the field in JavaThread
2724 
2725   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2726   __ movptr(Address(rbp, wordSize), rdx);
2727   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2728 
2729 #ifdef ASSERT
2730   // verify that there is really an exception oop in JavaThread
2731   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2732   __ verify_oop(rax);
2733 
2734   // verify that there is no pending exception
2735   Label no_pending_exception;
2736   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2737   __ testptr(rax, rax);
2738   __ jcc(Assembler::zero, no_pending_exception);
2739   __ stop("must not have pending exception here");
2740   __ bind(no_pending_exception);
2741 #endif
2742 
2743   __ bind(cont);
2744 
2745   // Call C code.  Need thread and this frame, but NOT official VM entry
2746   // crud.  We cannot block on this call, no GC can happen.
2747   //
2748   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2749 
2750   // fetch_unroll_info needs to call last_java_frame().
2751 
2752   __ set_last_Java_frame(noreg, noreg, NULL);
2753 #ifdef ASSERT
2754   { Label L;
2755     __ cmpptr(Address(r15_thread,
2756                     JavaThread::last_Java_fp_offset()),
2757             (int32_t)0);
2758     __ jcc(Assembler::equal, L);
2759     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2760     __ bind(L);
2761   }
2762 #endif // ASSERT
2763   __ mov(c_rarg0, r15_thread);
2764   __ movl(c_rarg1, r14); // exec_mode
2765   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2766 
2767   // Need to have an oopmap that tells fetch_unroll_info where to
2768   // find any register it might need.
2769   oop_maps->add_gc_map(__ pc() - start, map);
2770 
2771   __ reset_last_Java_frame(false);
2772 
2773 #if INCLUDE_JVMCI
2774   if (EnableJVMCI) {
2775     __ bind(after_fetch_unroll_info_call);
2776   }
2777 #endif
2778 
2779   // Load UnrollBlock* into rdi
2780   __ mov(rdi, rax);
2781 
2782   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2783   Label noException;
2784   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2785   __ jcc(Assembler::notEqual, noException);
2786   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2787   // QQQ: this load is useless; exception_pc was already cleared to NULL above
2788   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2789   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2790   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2791 
2792   __ verify_oop(rax);
2793 
2794   // Overwrite the result registers with the exception results.
2795   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2796   // I think this is useless
2797   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2798 
2799   __ bind(noException);
2800 
2801   // Only register save data is on the stack.
2802   // Now restore the result registers.  Everything else is either dead
2803   // or captured in the vframeArray.
2804   RegisterSaver::restore_result_registers(masm);
2805 
2806   // All of the register save area has been popped off the stack. Only the
2807   // return address remains.
2808 
2809   // Pop all the frames we must move/replace.
2810   //
2811   // Frame picture (youngest to oldest)
2812   // 1: self-frame (no frame link)
2813   // 2: deopting frame  (no frame link)
2814   // 3: caller of deopting frame (could be compiled/interpreted).
2815   //
2816   // Note: by leaving the return address of self-frame on the stack
2817   // and using the size of frame 2 to adjust the stack
2818   // when we are done the return to frame 3 will still be on the stack.
2819 
2820   // Pop deoptimized frame
2821   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2822   __ addptr(rsp, rcx);
2823 
2824   // rsp should be pointing at the return address to the caller (3)
2825 
2826   // Pick up the initial fp we should save
2827   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2828   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2829 
2830 #ifdef ASSERT
2831   // Compilers generate code that bangs the stack by as much as the
2832   // interpreter would need. So this stack banging should never
2833   // trigger a fault. Verify that it does not on non-product builds.
2834   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2835   __ bang_stack_size(rbx, rcx);
2836 #endif
2837 
2838   // Load address of array of frame pcs into rcx
2839   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2840 
2841   // Trash the old pc
2842   __ addptr(rsp, wordSize);
2843 
2844   // Load address of array of frame sizes into rsi
2845   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2846 
2847   // Load counter into rdx
2848   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2849 
2850   // Now adjust the caller's stack to make up for the extra locals,
2851   // but record the original sp so that we can save it in the skeletal interpreter
2852   // frame; that way the stack walking of interpreter_sender will get the unextended sp
2853   // value and not the "real" sp value.
2854 
2855   const Register sender_sp = r8;
2856 
2857   __ mov(sender_sp, rsp);
2858   __ movl(rbx, Address(rdi,
2859                        Deoptimization::UnrollBlock::
2860                        caller_adjustment_offset_in_bytes()));
2861   __ subptr(rsp, rbx);
2862 
2863   // Push interpreter frames in a loop
2864   Label loop;
2865   __ bind(loop);
2866   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2867   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2868   __ pushptr(Address(rcx, 0));          // Save return address
2869   __ enter();                           // Save old & set new ebp
2870   __ subptr(rsp, rbx);                  // Prolog
2871   // This value is corrected by layout_activation_impl
2872   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2873   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2874   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2875   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2876   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2877   __ decrementl(rdx);                   // Decrement counter
2878   __ jcc(Assembler::notZero, loop);
2879   __ pushptr(Address(rcx, 0));          // Save final return address
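     // The loop above, as a hedged C-like sketch (the array names mirror the
     // UnrollBlock fields loaded into rsi/rcx/rdx above):
     //
     //   for (int k = 0; k < number_of_frames; k++) {
     //     push(frame_pcs[k]);                    // return address for this frame
     //     push(rbp); rbp = rsp;                  // enter()
     //     rsp -= frame_sizes[k] - 2*wordSize;    // body of the skeletal frame
     //     interpreter_frame_last_sp = NULL;      // fixed up by layout_activation_impl
     //     interpreter_frame_sender_sp = sender_sp;
     //     sender_sp = rsp;
     //   }
     //   push(frame_pcs[number_of_frames]);       // final return address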
2880 
2881   // Re-push self-frame
2882   __ enter();                           // Save old & set new ebp
2883 
2884   // Allocate a full sized register save area.
2885   // Return address and rbp are in place, so we allocate two fewer words.
2886   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2887 
2888   // Restore frame locals after moving the frame
2889   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2890   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2891 
2892   // Call C code.  Need thread but NOT official VM entry
2893   // crud.  We cannot block on this call, no GC can happen.  Call should
2894   // restore return values to their stack-slots with the new SP.
2895   //
2896   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2897 
2898   // Use rbp because the frames look interpreted now
2899   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2900   // Don't need the precise return PC here, just precise enough to point into this code blob.
2901   address the_pc = __ pc();
2902   __ set_last_Java_frame(noreg, rbp, the_pc);
2903 
2904   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2905   __ mov(c_rarg0, r15_thread);
2906   __ movl(c_rarg1, r14); // second arg: exec_mode
2907   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2908   // Revert SP alignment after call since we're going to do some SP relative addressing below
2909   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2910 
2911   // Set an oopmap for the call site
2912   // Use the same PC we used for the last java frame
2913   oop_maps->add_gc_map(the_pc - start,
2914                        new OopMap( frame_size_in_words, 0 ));
2915 
2916   // Clear fp AND pc
2917   __ reset_last_Java_frame(true);
2918 
2919   // Collect return values
2920   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2921   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2922   // I think this is useless (throwing pc?)
2923   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2924 
2925   // Pop self-frame.
2926   __ leave();                           // Epilog
2927 
2928   // Jump to interpreter
2929   __ ret(0);
2930 
2931   // Make sure all code is generated
2932   masm->flush();
2933 
2934   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2935   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2936 #if INCLUDE_JVMCI
2937   if (EnableJVMCI) {
2938     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2939     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2940   }
2941 #endif
2942 }
2943 
2944 #ifdef COMPILER2
2945 //------------------------------generate_uncommon_trap_blob--------------------
2946 void SharedRuntime::generate_uncommon_trap_blob() {
2947   // Allocate space for the code
2948   ResourceMark rm;
2949   // Setup code generation tools
2950   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2951   MacroAssembler* masm = new MacroAssembler(&buffer);
2952 
2953   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2954 
2955   address start = __ pc();
2956 
2957   if (UseRTMLocking) {
2958     // Abort RTM transaction before possible nmethod deoptimization.
2959     __ xabort(0);
2960   }
2961 
2962   // Push self-frame.  We get here with a return address on the
2963   // stack, so rsp is 8-byte aligned until we allocate our frame.
2964   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
2965 
2966   // No callee saved registers. rbp is assumed implicitly saved
2967   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2968 
2969   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2970   // runtime expects it.
2971   __ movl(c_rarg1, j_rarg0);
2972 
2973   __ set_last_Java_frame(noreg, noreg, NULL);
2974 
2975   // Call C code.  Need thread but NOT official VM entry
2976   // crud.  We cannot block on this call, no GC can happen.  Call should
2977   // capture callee-saved registers as well as return values.
2978   // Thread is in rdi already.
2979   //
2980   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2981 
2982   __ mov(c_rarg0, r15_thread);
2983   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2984   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2985 
2986   // Set an oopmap for the call site
2987   OopMapSet* oop_maps = new OopMapSet();
2988   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2989 
2990   // location of rbp is known implicitly by the frame sender code
2991 
2992   oop_maps->add_gc_map(__ pc() - start, map);
2993 
2994   __ reset_last_Java_frame(false);
2995 
2996   // Load UnrollBlock* into rdi
2997   __ mov(rdi, rax);
2998 
2999 #ifdef ASSERT
3000   { Label L;
3001     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
3002             (int32_t)Deoptimization::Unpack_uncommon_trap);
3003     __ jcc(Assembler::equal, L);
3004     __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap");
3005     __ bind(L);
3006   }
3007 #endif
3008 
3009   // Pop all the frames we must move/replace.
3010   //
3011   // Frame picture (youngest to oldest)
3012   // 1: self-frame (no frame link)
3013   // 2: deopting frame  (no frame link)
3014   // 3: caller of deopting frame (could be compiled/interpreted).
3015 
3016   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3017   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3018 
3019   // Pop deoptimized frame (int)
3020   __ movl(rcx, Address(rdi,
3021                        Deoptimization::UnrollBlock::
3022                        size_of_deoptimized_frame_offset_in_bytes()));
3023   __ addptr(rsp, rcx);
3024 
3025   // rsp should be pointing at the return address to the caller (3)
3026 
3027   // Pick up the initial fp we should save
3028   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3029   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
3030 
3031 #ifdef ASSERT
3032   // Compilers generate code that bangs the stack by as much as the
3033   // interpreter would need. So this stack banging should never
3034   // trigger a fault. Verify that it does not on non-product builds.
3035   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
3036   __ bang_stack_size(rbx, rcx);
3037 #endif
3038 
3039   // Load address of array of frame pcs into rcx (address*)
3040   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
3041 
3042   // Trash the return pc
3043   __ addptr(rsp, wordSize);
3044 
3045   // Load address of array of frame sizes into rsi (intptr_t*)
3046   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
3047 
3048   // Counter
3049   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
3050 
3051   // Now adjust the caller's stack to make up for the extra locals, but
3052   // record the original sp so that we can save it in the skeletal
3053   // interpreter frame; that way the stack walking of interpreter_sender
3054   // will get the unextended sp value and not the "real" sp value.
3055 
3056   const Register sender_sp = r8;
3057 
3058   __ mov(sender_sp, rsp);
3059   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
3060   __ subptr(rsp, rbx);
3061 
3062   // Push interpreter frames in a loop
3063   Label loop;
3064   __ bind(loop);
3065   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3066   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3067   __ pushptr(Address(rcx, 0));     // Save return address
3068   __ enter();                      // Save old & set new rbp
3069   __ subptr(rsp, rbx);             // Prolog
3070   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3071             sender_sp);            // Make it walkable
3072   // This value is corrected by layout_activation_impl
3073   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
3074   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3075   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3076   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3077   __ decrementl(rdx);              // Decrement counter
3078   __ jcc(Assembler::notZero, loop);
3079   __ pushptr(Address(rcx, 0));     // Save final return address
3080 
3081   // Re-push self-frame
3082   __ enter();                 // Save old & set new rbp
3083   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3084                               // Prolog
3085 
3086   // Use rbp because the frames look interpreted now
3087   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3088   // Don't need the precise return PC here, just precise enough to point into this code blob.
3089   address the_pc = __ pc();
3090   __ set_last_Java_frame(noreg, rbp, the_pc);
3091 
3092   // Call C code.  Need thread but NOT official VM entry
3093   // crud.  We cannot block on this call, no GC can happen.  Call should
3094   // restore return values to their stack-slots with the new SP.
3095   // Thread is in rdi already.
3096   //
3097   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3098 
3099   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3100   __ mov(c_rarg0, r15_thread);
3101   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3102   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3103 
3104   // Set an oopmap for the call site
3105   // Use the same PC we used for the last java frame
3106   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3107 
3108   // Clear fp AND pc
3109   __ reset_last_Java_frame(true);
3110 
3111   // Pop self-frame.
3112   __ leave();                 // Epilog
3113 
3114   // Jump to interpreter
3115   __ ret(0);
3116 
3117   // Make sure all code is generated
3118   masm->flush();
3119 
3120   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3121                                                  SimpleRuntimeFrame::framesize >> 1);
3122 }
3123 #endif // COMPILER2
3124 
3125 //------------------------------generate_handler_blob------
3126 //
3127 // Generate a special Compile2Runtime blob that saves all registers
3128 // and sets up an oopmap.
3129 //
3130 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3131   assert(StubRoutines::forward_exception_entry() != NULL,
3132          "must be generated before");
3133 
3134   ResourceMark rm;
3135   OopMapSet *oop_maps = new OopMapSet();
3136   OopMap* map;
3137 
3138   // Allocate space for the code.  Setup code generation tools.
3139   CodeBuffer buffer("handler_blob", 2048, 1024);
3140   MacroAssembler* masm = new MacroAssembler(&buffer);
3141 
3142   address start   = __ pc();
3143   address call_pc = NULL;
3144   int frame_size_in_words;
3145   bool cause_return = (poll_type == POLL_AT_RETURN);
3146   bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3147 
3148   if (UseRTMLocking) {
3149     // Abort RTM transaction before calling runtime
3150     // because critical section will be large and will be
3151     // aborted anyway. Also nmethod could be deoptimized.
3152     __ xabort(0);
3153   }
3154 
3155   // Make room for return address (or push it again)
3156   if (!cause_return) {
3157     __ push(rbx);
3158   }
3159 
3160   // Save registers, fpu state, and flags
3161   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
3162 
3163   // The following is basically a call_VM.  However, we need the precise
3164   // address of the call in order to generate an oopmap. Hence, we do all the
3165   // work ourselves.
3166 
3167   __ set_last_Java_frame(noreg, noreg, NULL);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3168 
3169   // The return address must always be correct so that frame constructor never
3170   // sees an invalid pc.
3171 
3172   if (!cause_return) {
3173     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3174     // Additionally, rbx is a callee saved register and we can look at it later to determine
3175     // if someone changed the return address for us!
3176     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3177     __ movptr(Address(rbp, wordSize), rbx);
3178   }
3179 
3180   // Do the call
3181   __ mov(c_rarg0, r15_thread);
3182   __ call(RuntimeAddress(call_ptr));
3183 
3184   // Set an oopmap for the call site.  This oopmap will map all
3185   // oop-registers and debug-info registers as callee-saved.  This
3186   // will allow deoptimization at this safepoint to find all possible
3187   // debug-info recordings, as well as let GC find all oops.
3188 
3189   oop_maps->add_gc_map( __ pc() - start, map);
3190 
3191   Label noException;
3192 
3193   __ reset_last_Java_frame(false);
3194 
3195   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3196   __ jcc(Assembler::equal, noException);
3197 
3198   // Exception pending
3199 
3200   RegisterSaver::restore_live_registers(masm, save_vectors);
3201 
3202   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3203 
3204   // No exception case
3205   __ bind(noException);
3206 
3207   Label no_adjust;
3208 #ifdef ASSERT
3209   Label bail;
3210 #endif
3211   if (!cause_return) {
3212     Label no_prefix, not_special;
3213 
3214     // If our stashed return pc was modified by the runtime we avoid touching it
3215     __ cmpptr(rbx, Address(rbp, wordSize));
3216     __ jccb(Assembler::notEqual, no_adjust);
3217 
3218     // Skip over the poll instruction.
3219     // See NativeInstruction::is_safepoint_poll()
3220     // Possible encodings:
3221     //      85 00       test   %eax,(%rax)
3222     //      85 01       test   %eax,(%rcx)
3223     //      85 02       test   %eax,(%rdx)
3224     //      85 03       test   %eax,(%rbx)
3225     //      85 06       test   %eax,(%rsi)
3226     //      85 07       test   %eax,(%rdi)
3227     //
3228     //   41 85 00       test   %eax,(%r8)
3229     //   41 85 01       test   %eax,(%r9)
3230     //   41 85 02       test   %eax,(%r10)
3231     //   41 85 03       test   %eax,(%r11)
3232     //   41 85 06       test   %eax,(%r14)
3233     //   41 85 07       test   %eax,(%r15)
3234     //
3235     //      85 04 24    test   %eax,(%rsp)
3236     //   41 85 04 24    test   %eax,(%r12)
3237     //      85 45 00    test   %eax,0x0(%rbp)
3238     //   41 85 45 00    test   %eax,0x0(%r13)
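     //
     // In other words, the poll is two bytes (opcode 0x85 plus modrm), with one
     // extra byte when a REX.B prefix (0x41) is present and one more byte (SIB or
     // disp8) when the base register is from the rsp/rbp/r12/r13 family.  The
     // code below advances rbx past exactly those bytes.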
3239 
3240     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3241     __ jcc(Assembler::notEqual, no_prefix);
3242     __ addptr(rbx, 1);
3243     __ bind(no_prefix);
3244 #ifdef ASSERT
3245     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3246 #endif
3247     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3248     // r12/rsp 0x04
3249     // r13/rbp 0x05
3250     __ movzbq(rcx, Address(rbx, 1));
3251     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3252     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3253     __ cmpptr(rcx, 1);
3254     __ jcc(Assembler::above, not_special);
3255     __ addptr(rbx, 1);
3256     __ bind(not_special);
3257 #ifdef ASSERT
3258     // Verify the correct encoding of the poll we're about to skip.
3259     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3260     __ jcc(Assembler::notEqual, bail);
3261     // Mask out the modrm bits
3262     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3263     // rax encodes to 0, so if the bits are nonzero it's incorrect
3264     __ jcc(Assembler::notZero, bail);
3265 #endif
3266     // Adjust return pc forward to step over the safepoint poll instruction
3267     __ addptr(rbx, 2);
3268     __ movptr(Address(rbp, wordSize), rbx);
3269   }
3270 
3271   __ bind(no_adjust);
3272   // Normal exit, restore registers and exit.
3273   RegisterSaver::restore_live_registers(masm, save_vectors);
3274   __ ret(0);
3275 
3276 #ifdef ASSERT
3277   __ bind(bail);
3278   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3279 #endif
3280 
3281   // Make sure all code is generated
3282   masm->flush();
3283 
3284   // Fill-out other meta info
3285   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3286 }
3287 
3288 //
3289 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3290 //
3291 // Generate a stub that calls into vm to find out the proper destination
3292 // of a java call. All the argument registers are live at this point,
3293 // but since this is generic code we don't know what they are, and the caller
3294 // must do any gc of the args.
3295 //
3296 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3297   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3298 
3299   // allocate space for the code
3300   ResourceMark rm;
3301 
3302   CodeBuffer buffer(name, 1000, 512);
3303   MacroAssembler* masm                = new MacroAssembler(&buffer);
3304 
3305   int frame_size_in_words;
3306 
3307   OopMapSet *oop_maps = new OopMapSet();
3308   OopMap* map = NULL;
3309 
3310   int start = __ offset();
3311 
3312   // No need to save vector registers since they are caller-saved anyway.
3313   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
3314 
3315   // __ stop_if_in_cont(r10, "CONT 3");
3316 
3317   int frame_complete = __ offset();
3318 
3319   __ set_last_Java_frame(noreg, noreg, NULL);
3320 
3321   __ mov(c_rarg0, r15_thread);
3322 
3323   __ call(RuntimeAddress(destination));
3324 
3325 
3326   // Set an oopmap for the call site.
3327   // We need this not only for callee-saved registers, but also for volatile
3328   // registers that the compiler might be keeping live across a safepoint.
3329 
3330   oop_maps->add_gc_map( __ offset() - start, map);
3331 
3332   // rax contains the address we are going to jump to assuming no exception got installed
3333 
3334   // clear last_Java_sp
3335   __ reset_last_Java_frame(false);
3336   // check for pending exceptions
3337   Label pending;
3338   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3339   __ jcc(Assembler::notEqual, pending);
3340 
3341   // get the returned Method*
3342   __ get_vm_result_2(rbx, r15_thread);
3343   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3344 
3345   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3346 
3347   RegisterSaver::restore_live_registers(masm);
3348 
3349   // We are back to the original state on entry and ready to go.
3350 
3351   __ jmp(rax);
3352 
3353   // Pending exception after the safepoint
3354 
3355   __ bind(pending);
3356 
3357   RegisterSaver::restore_live_registers(masm);
3358 
3359   // exception pending => remove activation and forward to exception handler
3360 
3361   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3362 
3363   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3364   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3365 
3366   // -------------
3367   // make sure all code is generated
3368   masm->flush();
3369 
3370   // return the blob
3371   // (frame_size_in_words is in words, which is what new_runtime_stub expects)
3372   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3373 }
3374 
3375 #ifdef COMPILER2
3376 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3377 
3378 class NativeInvokerGenerator : public StubCodeGenerator {
3379   address _call_target;
3380   int _shadow_space_bytes;
3381 
3382   const GrowableArray<VMReg>& _input_registers;
3383   const GrowableArray<VMReg>& _output_registers;
3384 
3385   int _frame_complete;
3386   int _framesize;
3387   OopMapSet* _oop_maps;
3388 public:
3389   NativeInvokerGenerator(CodeBuffer* buffer,
3390                          address call_target,
3391                          int shadow_space_bytes,
3392                          const GrowableArray<VMReg>& input_registers,
3393                          const GrowableArray<VMReg>& output_registers)
3394    : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3395      _call_target(call_target),
3396      _shadow_space_bytes(shadow_space_bytes),
3397      _input_registers(input_registers),
3398      _output_registers(output_registers),
3399      _frame_complete(0),
3400      _framesize(0),
3401      _oop_maps(NULL) {
3402     assert(_output_registers.length() <= 1
3403            || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3404 
3405   }
3406 
3407   void generate();
3408 
3409   int spill_size_in_bytes() const {
3410     if (_output_registers.length() == 0) {
3411       return 0;
3412     }
3413     VMReg reg = _output_registers.at(0);
3414     assert(reg->is_reg(), "must be a register");
3415     if (reg->is_Register()) {
3416       return 8;
3417     } else if (reg->is_XMMRegister()) {
3418       if (UseAVX >= 3) {
3419         return 64;
3420       } else if (UseAVX >= 1) {
3421         return 32;
3422       } else {
3423         return 16;
3424       }
3425     } else {
3426       ShouldNotReachHere();
3427     }
3428     return 0;
3429   }
3430 
3431   void spill_out_registers() {
3432     if (_output_registers.length() == 0) {
3433       return;
3434     }
3435     VMReg reg = _output_registers.at(0);
3436     assert(reg->is_reg(), "must be a register");
3437     MacroAssembler* masm = _masm;
3438     if (reg->is_Register()) {
3439       __ movptr(Address(rsp, 0), reg->as_Register());
3440     } else if (reg->is_XMMRegister()) {
3441       if (UseAVX >= 3) {
3442         __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3443       } else if (UseAVX >= 1) {
3444         __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3445       } else {
3446         __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3447       }
3448     } else {
3449       ShouldNotReachHere();
3450     }
3451   }
3452 
3453   void fill_out_registers() {
3454     if (_output_registers.length() == 0) {
3455       return;
3456     }
3457     VMReg reg = _output_registers.at(0);
3458     assert(reg->is_reg(), "must be a register");
3459     MacroAssembler* masm = _masm;
3460     if (reg->is_Register()) {
3461       __ movptr(reg->as_Register(), Address(rsp, 0));
3462     } else if (reg->is_XMMRegister()) {
3463       if (UseAVX >= 3) {
3464         __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3465       } else if (UseAVX >= 1) {
3466         __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3467       } else {
3468         __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3469       }
3470     } else {
3471       ShouldNotReachHere();
3472     }
3473   }
3474 
3475   int frame_complete() const {
3476     return _frame_complete;
3477   }
3478 
3479   int framesize() const {
3480     return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3481   }
3482 
3483   OopMapSet* oop_maps() const {
3484     return _oop_maps;
3485   }
3486 
3487 private:
3488 #ifdef ASSERT
3489   bool target_uses_register(VMReg reg) {
3490     return _input_registers.contains(reg) || _output_registers.contains(reg);
3491   }
3492 #endif
3493 };
3494 
3495 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3496                                                 int shadow_space_bytes,
3497                                                 const GrowableArray<VMReg>& input_registers,
3498                                                 const GrowableArray<VMReg>& output_registers) {
3499   int locs_size  = 64;
3500   CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3501   NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3502   g.generate();
3503   code.log_section_sizes("nep_invoker_blob");
3504 
3505   RuntimeStub* stub =
3506     RuntimeStub::new_runtime_stub("nep_invoker_blob",
3507                                   &code,
3508                                   g.frame_complete(),
3509                                   g.framesize(),
3510                                   g.oop_maps(), false);
3511   return stub;
3512 }
3513 
3514 void NativeInvokerGenerator::generate() {
3515   assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3516 
3517   enum layout {
3518     rbp_off,
3519     rbp_off2,
3520     return_off,
3521     return_off2,
3522     framesize // inclusive of return address
3523   };
3524 
3525   _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3526   assert(is_even(_framesize/2), "sp not 16-byte aligned");
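     // Illustrative arithmetic (assuming no shadow space and a single 8-byte GP
     // return spill): _framesize = align_up(4 + 8/4, 4) = 8 slots = 32 bytes,
     // so rsp stays 16-byte aligned after the enter()/subptr() sequence below.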
3527 
3528   _oop_maps  = new OopMapSet();
3529   MacroAssembler* masm = _masm;
3530 
3531   address start = __ pc();
3532 
3533   __ enter();
3534 
3535   // return address and rbp are already in place
3536   __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3537 
3538   _frame_complete = __ pc() - start;
3539 
3540   address the_pc = __ pc();
3541 
3542   __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3543   OopMap* map = new OopMap(_framesize, 0);
3544   _oop_maps->add_gc_map(the_pc - start, map);
3545 
3546   // State transition
3547   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3548 
3549   __ call(RuntimeAddress(_call_target));
3550 
3551   __ restore_cpu_control_state_after_jni();
3552 
3553   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3554 
3555   // Force this write out before the read below
3556   __ membar(Assembler::Membar_mask_bits(
3557           Assembler::LoadLoad | Assembler::LoadStore |
3558           Assembler::StoreLoad | Assembler::StoreStore));
3559 
3560   Label L_after_safepoint_poll;
3561   Label L_safepoint_poll_slow_path;
3562 
3563   __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3564   __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3565   __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3566 
3567   __ bind(L_after_safepoint_poll);
3568 
3569   // change thread state
3570   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3571 
3572   __ block_comment("reguard stack check");
3573   Label L_reguard;
3574   Label L_after_reguard;
3575   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3576   __ jcc(Assembler::equal, L_reguard);
3577   __ bind(L_after_reguard);
3578 
3579   __ reset_last_Java_frame(r15_thread, true);
3580 
3581   __ leave(); // required for proper stackwalking of RuntimeStub frame
3582   __ ret(0);
3583 
3584   //////////////////////////////////////////////////////////////////////////////
3585 
3586   __ block_comment("{ L_safepoint_poll_slow_path");
3587   __ bind(L_safepoint_poll_slow_path);
3588   __ vzeroupper();
3589 
3590   spill_out_registers();
3591 
3592   __ mov(c_rarg0, r15_thread);
3593   __ mov(r12, rsp); // remember sp
3594   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3595   __ andptr(rsp, -16); // align stack as required by ABI
3596   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3597   __ mov(rsp, r12); // restore sp
3598   __ reinit_heapbase();
3599 
3600   fill_out_registers();
3601 
3602   __ jmp(L_after_safepoint_poll);
3603   __ block_comment("} L_safepoint_poll_slow_path");
3604 
3605   //////////////////////////////////////////////////////////////////////////////
3606 
3607   __ block_comment("{ L_reguard");
3608   __ bind(L_reguard);
3609   __ vzeroupper();
3610 
3611   spill_out_registers();
3612 
3613   __ mov(r12, rsp); // remember sp
3614   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3615   __ andptr(rsp, -16); // align stack as required by ABI
3616   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3617   __ mov(rsp, r12); // restore sp
3618   __ reinit_heapbase();
3619 
3620   fill_out_registers();
3621 
3622   __ jmp(L_after_reguard);
3623 
3624   __ block_comment("} L_reguard");
3625 
3626   //////////////////////////////////////////////////////////////////////////////
3627 
3628   __ flush();
3629 }
3630 #endif // COMPILER2
3631 
3632 //------------------------------Montgomery multiplication------------------------
3633 //
3634 
3635 #ifndef _WINDOWS
3636 
3637 // Subtract 0:b from carry:a.  Return carry.
3638 static julong
3639 sub(julong a[], julong b[], julong carry, long len) {
3640   long long i = 0, cnt = len;
3641   julong tmp;
3642   asm volatile("clc; "
3643                "0: ; "
3644                "mov (%[b], %[i], 8), %[tmp]; "
3645                "sbb %[tmp], (%[a], %[i], 8); "
3646                "inc %[i]; dec %[cnt]; "
3647                "jne 0b; "
3648                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3649                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3650                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3651                : "memory");
3652   return tmp;
3653 }
3654 
3655 // Multiply (unsigned) Long A by Long B, accumulating the double-
3656 // length result into the accumulator formed of T0, T1, and T2.
3657 #define MACC(A, B, T0, T1, T2)                                  \
3658 do {                                                            \
3659   unsigned long hi, lo;                                         \
3660   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3661            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3662            : "r"(A), "a"(B) : "cc");                            \
3663  } while(0)
3664 
3665 // As above, but add twice the double-length result into the
3666 // accumulator.
3667 #define MACC2(A, B, T0, T1, T2)                                 \
3668 do {                                                            \
3669   unsigned long hi, lo;                                         \
3670   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3671            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3672            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3673            : "r"(A), "a"(B) : "cc");                            \
3674  } while(0)
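     // In both macros the triple (T2:T1:T0) acts as a 192-bit accumulator: for
     // example, MACC(a, b, t0, t1, t2) adds the full 128-bit product a*b into
     // t2:t1:t0, and MACC2 adds that product twice (used for the off-diagonal
     // terms when squaring).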
3675 
3676 #else //_WINDOWS
3677 
3678 static julong
3679 sub(julong a[], julong b[], julong carry, long len) {
3680   long i;
3681   julong tmp;
3682   unsigned char c = 1;
3683   for (i = 0; i < len; i++) {
3684     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3685     a[i] = tmp;
3686   }
3687   c = _addcarry_u64(c, carry, ~0, &tmp);
3688   return tmp;
3689 }
3690 
3691 // Multiply (unsigned) Long A by Long B, accumulating the double-
3692 // length result into the accumulator formed of T0, T1, and T2.
3693 #define MACC(A, B, T0, T1, T2)                          \
3694 do {                                                    \
3695   julong hi, lo;                                        \
3696   lo = _umul128(A, B, &hi);                             \
3697   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3698   c = _addcarry_u64(c, hi, T1, &T1);                    \
3699   _addcarry_u64(c, T2, 0, &T2);                         \
3700  } while(0)
3701 
3702 // As above, but add twice the double-length result into the
3703 // accumulator.
3704 #define MACC2(A, B, T0, T1, T2)                         \
3705 do {                                                    \
3706   julong hi, lo;                                        \
3707   lo = _umul128(A, B, &hi);                             \
3708   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3709   c = _addcarry_u64(c, hi, T1, &T1);                    \
3710   _addcarry_u64(c, T2, 0, &T2);                         \
3711   c = _addcarry_u64(0, lo, T0, &T0);                    \
3712   c = _addcarry_u64(c, hi, T1, &T1);                    \
3713   _addcarry_u64(c, T2, 0, &T2);                         \
3714  } while(0)
3715 
3716 #endif //_WINDOWS
3717 
3718 // Fast Montgomery multiplication.  The derivation of the algorithm is
3719 // in  A Cryptographic Library for the Motorola DSP56000,
3720 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
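     // Roughly, with R = 2^(64*len) and inv == -n^-1 (mod 2^64) -- which is what
     // the assert on inv * n[0] checks -- montgomery_multiply computes m with
     // m == a * b * R^-1 (mod n): each step folds in one limb of a*b together
     // with a multiple of n chosen via inv so that the low limb of the
     // accumulator becomes zero, and the trailing sub() loop subtracts n while a
     // carry out of the top limb remains.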
3721 
3722 static void NOINLINE
3723 montgomery_multiply(julong a[], julong b[], julong n[],
3724                     julong m[], julong inv, int len) {
3725   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3726   int i;
3727 
3728   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3729 
3730   for (i = 0; i < len; i++) {
3731     int j;
3732     for (j = 0; j < i; j++) {
3733       MACC(a[j], b[i-j], t0, t1, t2);
3734       MACC(m[j], n[i-j], t0, t1, t2);
3735     }
3736     MACC(a[i], b[0], t0, t1, t2);
3737     m[i] = t0 * inv;
3738     MACC(m[i], n[0], t0, t1, t2);
3739 
3740     assert(t0 == 0, "broken Montgomery multiply");
3741 
3742     t0 = t1; t1 = t2; t2 = 0;
3743   }
3744 
3745   for (i = len; i < 2*len; i++) {
3746     int j;
3747     for (j = i-len+1; j < len; j++) {
3748       MACC(a[j], b[i-j], t0, t1, t2);
3749       MACC(m[j], n[i-j], t0, t1, t2);
3750     }
3751     m[i-len] = t0;
3752     t0 = t1; t1 = t2; t2 = 0;
3753   }
3754 
3755   while (t0)
3756     t0 = sub(m, n, t0, len);
3757 }
3758 
3759 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3760 // multiplies so it should be up to 25% faster than Montgomery
3761 // multiplication.  However, its loop control is more complex and it
3762 // may actually run slower on some machines.
3763 
3764 static void NOINLINE
3765 montgomery_square(julong a[], julong n[],
3766                   julong m[], julong inv, int len) {
3767   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3768   int i;
3769 
3770   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3771 
3772   for (i = 0; i < len; i++) {
3773     int j;
3774     int end = (i+1)/2;
3775     for (j = 0; j < end; j++) {
3776       MACC2(a[j], a[i-j], t0, t1, t2);
3777       MACC(m[j], n[i-j], t0, t1, t2);
3778     }
3779     if ((i & 1) == 0) {
3780       MACC(a[j], a[j], t0, t1, t2);
3781     }
3782     for (; j < i; j++) {
3783       MACC(m[j], n[i-j], t0, t1, t2);
3784     }
3785     m[i] = t0 * inv;
3786     MACC(m[i], n[0], t0, t1, t2);
3787 
3788     assert(t0 == 0, "broken Montgomery square");
3789 
3790     t0 = t1; t1 = t2; t2 = 0;
3791   }
3792 
3793   for (i = len; i < 2*len; i++) {
3794     int start = i-len+1;
3795     int end = start + (len - start)/2;
3796     int j;
3797     for (j = start; j < end; j++) {
3798       MACC2(a[j], a[i-j], t0, t1, t2);
3799       MACC(m[j], n[i-j], t0, t1, t2);
3800     }
3801     if ((i & 1) == 0) {
3802       MACC(a[j], a[j], t0, t1, t2);
3803     }
3804     for (; j < len; j++) {
3805       MACC(m[j], n[i-j], t0, t1, t2);
3806     }
3807     m[i-len] = t0;
3808     t0 = t1; t1 = t2; t2 = 0;
3809   }
3810 
3811   while (t0)
3812     t0 = sub(m, n, t0, len);
3813 }
3814 
3815 // Swap words in a longword.
3816 static julong swap(julong x) {
3817   return (x << 32) | (x >> 32);
3818 }
3819 
3820 // Copy len longwords from s to d, word-swapping as we go.  The
3821 // destination array is reversed.
3822 static void reverse_words(julong *s, julong *d, int len) {
3823   d += len;
3824   while(len-- > 0) {
3825     d--;
3826     *d = swap(*s);
3827     s++;
3828   }
3829 }
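     // The jint arrays passed to the entry points below arrive most-significant
     // word first (presumably BigInteger magnitude order), while the routines
     // above want least-significant longword first, so reverse_words both
     // reverses the array and swaps the two 32-bit halves of each longword.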
3830 
3831 // The threshold at which squaring is advantageous was determined
3832 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3833 #define MONTGOMERY_SQUARING_THRESHOLD 64
3834 
3835 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3836                                         jint len, jlong inv,
3837                                         jint *m_ints) {
3838   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3839   int longwords = len/2;
3840 
3841   // Make very sure we don't use so much space that the stack might
3842   // overflow.  512 jints corresponds to a 16384-bit integer and will use
3843   // a total of 8k bytes of stack space here.
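     // (For example, at the 512-jint limit: longwords = 256, and four scratch
     // arrays of 256 julongs each come to 4 * 256 * 8 = 8192 bytes, matching
     // the guarantee below.)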
3844   int total_allocation = longwords * sizeof (julong) * 4;
3845   guarantee(total_allocation <= 8192, "must be");
3846   julong *scratch = (julong *)alloca(total_allocation);
3847 
3848   // Local scratch arrays
3849   julong
3850     *a = scratch + 0 * longwords,
3851     *b = scratch + 1 * longwords,
3852     *n = scratch + 2 * longwords,
3853     *m = scratch + 3 * longwords;
3854 
3855   reverse_words((julong *)a_ints, a, longwords);
3856   reverse_words((julong *)b_ints, b, longwords);
3857   reverse_words((julong *)n_ints, n, longwords);
3858 
3859   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3860 
3861   reverse_words(m, (julong *)m_ints, longwords);
3862 }
3863 
3864 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3865                                       jint len, jlong inv,
3866                                       jint *m_ints) {
3867   assert(len % 2 == 0, "array length in montgomery_square must be even");
3868   int longwords = len/2;
3869 
3870   // Make very sure we don't use so much space that the stack might
3871   // overflow.  512 jints corresponds to a 16384-bit integer and will use
3872   // a total of 6k bytes of stack space here.
3873   int total_allocation = longwords * sizeof (julong) * 3;
3874   guarantee(total_allocation <= 8192, "must be");
3875   julong *scratch = (julong *)alloca(total_allocation);
3876 
3877   // Local scratch arrays
3878   julong
3879     *a = scratch + 0 * longwords,
3880     *n = scratch + 1 * longwords,
3881     *m = scratch + 2 * longwords;
3882 
3883   reverse_words((julong *)a_ints, a, longwords);
3884   reverse_words((julong *)n_ints, n, longwords);
3885 
3886   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3887     ::montgomery_square(a, n, m, (julong)inv, longwords);
3888   } else {
3889     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3890   }
3891 
3892   reverse_words(m, (julong *)m_ints, longwords);
3893 }
3894 
3895 #ifdef COMPILER2
3896 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3897 //
3898 //------------------------------generate_exception_blob---------------------------
3899 // creates exception blob at the end
3900 // Using the exception blob, this code is jumped to from a compiled method
3901 // (see emit_exception_handler in the x86_64.ad file).
3902 //
3903 // Given an exception pc at a call, we call into the runtime for the
3904 // handler in this method. This handler might merely restore state
3905 // (i.e., callee-saved registers), unwind the frame, and jump to the
3906 // exception handler for the nmethod if there is no Java-level handler
3907 // for the nmethod.
3908 //
3909 // This code is entered with a jmp.
3910 //
3911 // Arguments:
3912 //   rax: exception oop
3913 //   rdx: exception pc
3914 //
3915 // Results:
3916 //   rax: exception oop
3917 //   rdx: exception pc in caller or ???
3918 //   destination: exception handler of caller
3919 //
3920 // Note: the exception pc MUST be at a call (precise debug information)
3921 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3922 //
3923 
3924 void OptoRuntime::generate_exception_blob() {
3925   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3926   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3927   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3928 
3929   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3930 
3931   // Allocate space for the code
3932   ResourceMark rm;
3933   // Setup code generation tools
3934   CodeBuffer buffer("exception_blob", 2048, 1024);
3935   MacroAssembler* masm = new MacroAssembler(&buffer);
3936 
3937 
3938   address start = __ pc();
3939 
3940   // Exception pc is 'return address' for stack walker
3941   __ push(rdx);
3942   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3943 
3944   // Save callee-saved registers.  See x86_64.ad.
3945 
3946   // rbp is an implicitly saved callee-saved register (i.e., the calling
3947   // convention will save/restore it in the prolog/epilog). Other than that
3948   // there are no callee-saved registers now that adapter frames are gone.
3949 
3950   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3951 
3952   // Store exception in Thread object. We cannot pass any arguments to the
3953   // handle_exception call, since we do not want to make any assumption
3954   // about the size of the frame in which the exception happened.
3955   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3956   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3957   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3958 
3959   // This call does all the hard work.  It checks if an exception handler
3960   // exists in the method.
3961   // If so, it returns the handler address.
3962   // If not, it prepares for stack-unwinding, restoring the callee-save
3963   // registers of the frame being removed.
3964   //
3965   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3966 
3967   // At a method handle call, the stack may not be properly aligned
3968   // when returning with an exception.
3969   address the_pc = __ pc();
3970   __ set_last_Java_frame(noreg, noreg, the_pc);
3971   __ mov(c_rarg0, r15_thread);
3972   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3973   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3974 
3975   // Set an oopmap for the call site.  This oopmap will only be used if we
3976   // are unwinding the stack.  Hence, all locations will be dead.
3977   // Callee-saved registers will be the same as the frame above (i.e.,
3978   // handle_exception_stub), since they were restored when we got the
3979   // exception.
3980 
3981   OopMapSet* oop_maps = new OopMapSet();
3982 
3983   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3984 
3985   __ reset_last_Java_frame(false);
3986 
3987   // Restore callee-saved registers
3988 
3989   // rbp is an implicitly saved callee-saved register (i.e., the calling
3990   // convention will save/restore it in the prolog/epilog). Other than that,
3991   // there are no callee-saved registers now that adapter frames are gone.
3992 
3993   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3994 
3995   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3996   __ pop(rdx);                  // No need for exception pc anymore
3997 
3998   // rax: exception handler
3999 
4000   // We have a handler in rax (could be deopt blob).
4001   __ mov(r8, rax);
4002 
4003   // Get the exception oop
4004   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
4005   // Get the exception pc in case we are deoptimized
4006   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
4007 #ifdef ASSERT
4008   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
4009   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
4010 #endif
4011   // Clear the exception oop so GC no longer processes it as a root.
4012   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
4013 
4014   // rax: exception oop
4015   // r8:  exception handler
4016   // rdx: exception pc
4017   // Jump to handler
4018 
4019   __ jmp(r8);
4020 
4021   // Make sure all code is generated
4022   masm->flush();
4023 
4024   // Set exception blob
4025   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
4026 }
4027 #endif // COMPILER2
4028 
4029 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
4030                                        int total_in_args, const VMRegPair* in_regs,
4031                                        int total_out_args, VMRegPair* out_regs,
4032                                        GrowableArray<int>& arg_order,
4033                                        VMRegPair tmp_vmreg) {
4034   ComputeMoveOrder order(total_in_args, in_regs,
4035                          total_out_args, out_regs,
4036                          in_sig_bt, arg_order, tmp_vmreg);
4037 }