1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "classfile/symbolTable.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/icBuffer.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/compiledICHolder.hpp"
  46 #include "oops/klass.inline.hpp"
  47 #include "prims/methodHandles.hpp"
  48 #include "runtime/jniHandles.hpp"
  49 #include "runtime/safepointMechanism.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/signature.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "runtime/vframeArray.hpp"
  54 #include "runtime/vm_version.hpp"
  55 #include "utilities/align.hpp"
  56 #include "utilities/formatBuffer.hpp"
  57 #include "vmreg_x86.inline.hpp"
  58 #ifdef COMPILER1
  59 #include "c1/c1_Runtime1.hpp"
  60 #endif
  61 #ifdef COMPILER2
  62 #include "opto/runtime.hpp"
  63 #endif
  64 #if INCLUDE_JVMCI
  65 #include "jvmci/jvmciJavaClasses.hpp"
  66 #endif
  67 
  68 #define __ masm->
  69 
  70 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  71 
  72 class SimpleRuntimeFrame {
  73 
  74   public:
  75 
  76   // Most of the runtime stubs have this simple frame layout.
  77   // This class exists to make the layout shared in one place.
  78   // Offsets are for compiler stack slots, which are jints.
  79   enum layout {
  80     // The frame sender code expects that rbp will be in the "natural" place and
  81     // will override any oopMap setting for it. We must therefore force the layout
  82     // so that it agrees with the frame sender code.
  83     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  84     rbp_off2,
  85     return_off, return_off2,
  86     framesize
  87   };
  88 };
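     // For reference, the layout above describes a frame that, from low to high
     // addresses, consists of the argument register save area, the saved rbp
     // (two 4-byte slots) and the return address (two 4-byte slots); "framesize"
     // is the total size of such a frame in 4-byte compiler stack slots.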
  89 
  90 class RegisterSaver {
  91   // Capture info about frame layout.  Layout offsets are in jint
  92   // units because compiler frame slots are jints.
  93 #define XSAVE_AREA_BEGIN 160
  94 #define XSAVE_AREA_YMM_BEGIN 576
  95 #define XSAVE_AREA_OPMASK_BEGIN 1088
  96 #define XSAVE_AREA_ZMM_BEGIN 1152
  97 #define XSAVE_AREA_UPPERBANK 1664
  98 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  99 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 100 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 101 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 102 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
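       // To illustrate how the macros above expand: DEF_XMM_OFFS(1) yields
       //   xmm1_off = xmm_off + (1)*16/BytesPerInt, xmm1H_off
       // i.e. a pair of 4-byte slot indices per register, where the "H" entry names
       // the next slot and is used below when recording the upper half of the
       // register in the oop map.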
 103   enum layout {
 104     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 105     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 106     DEF_XMM_OFFS(0),
 107     DEF_XMM_OFFS(1),
 108     // 2..15 are implied in range usage
 109     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 110     DEF_YMM_OFFS(0),
 111     DEF_YMM_OFFS(1),
 112     // 2..15 are implied in range usage
 113     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 114     DEF_OPMASK_OFFS(0),
 115     DEF_OPMASK_OFFS(1),
 116     // 2..7 are implied in range usage
 117     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 118     DEF_ZMM_OFFS(0),
 119     DEF_ZMM_OFFS(1),
 120     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 121     DEF_ZMM_UPPER_OFFS(16),
 122     DEF_ZMM_UPPER_OFFS(17),
 123     // 18..31 are implied in range usage
 124     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 125     fpu_stateH_end,
 126     r15_off, r15H_off,
 127     r14_off, r14H_off,
 128     r13_off, r13H_off,
 129     r12_off, r12H_off,
 130     r11_off, r11H_off,
 131     r10_off, r10H_off,
 132     r9_off,  r9H_off,
 133     r8_off,  r8H_off,
 134     rdi_off, rdiH_off,
 135     rsi_off, rsiH_off,
 136     ignore_off, ignoreH_off,  // extra copy of rbp
 137     rsp_off, rspH_off,
 138     rbx_off, rbxH_off,
 139     rdx_off, rdxH_off,
 140     rcx_off, rcxH_off,
 141     rax_off, raxH_off,
 142     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 143     align_off, alignH_off,
 144     flags_off, flagsH_off,
 145     // The frame sender code expects that rbp will be in the "natural" place and
 146     // will override any oopMap setting for it. We must therefore force the layout
 147     // so that it agrees with the frame sender code.
 148     rbp_off, rbpH_off,        // copy of rbp we will restore
 149     return_off, returnH_off,  // slot for return address
 150     reg_save_size             // size in compiler stack slots
 151   };
 152 
 153  public:
 154   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors);
 155   static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 156 
 157   // Offsets into the register save area
 158   // Used by deoptimization when it is managing result register
 159   // values on its own
 160 
 161   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 162   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 163   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 164   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 165   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 166 
 167   // During deoptimization only the result registers need to be restored,
 168   // all the other values have already been extracted.
 169   static void restore_result_registers(MacroAssembler* masm);
 170 };
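     // The layout enum above counts 4-byte compiler slots, so the *_offset_in_bytes()
     // accessors simply scale a slot index by BytesPerInt; restore_result_registers()
     // below uses them as offsets from rsp while the save frame is in place.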
 171 
 172 // Register is a class, but it would be assigned a numerical value.
 173 // "0" is assigned for rax. Thus we need to ignore -Wnonnull.
 174 PRAGMA_DIAG_PUSH
 175 PRAGMA_NONNULL_IGNORED
 176 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
 177   int off = 0;
 178   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 179   if (UseAVX < 3) {
 180     num_xmm_regs = num_xmm_regs/2;
 181   }
 182 #if COMPILER2_OR_JVMCI
 183   if (save_vectors && UseAVX == 0) {
 184     save_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 185   }
 186   assert(!save_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 187 #else
 188   save_vectors = false; // vectors are generated only by C2 and JVMCI
 189 #endif
 190 
 191   // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 192   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 193   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 194   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 195   // CodeBlob frame size is in words.
 196   int frame_size_in_words = frame_size_in_bytes / wordSize;
 197   *total_frame_words = frame_size_in_words;
 198 
 199   // Save registers, fpu state, and flags.
 200   // We assume caller has already pushed the return address onto the
 201   // stack, so rsp is 8-byte aligned here.
 202   // We push rbp twice in this sequence because we want the real rbp
 203   // to be under the return address like a normal enter.
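       // Roughly, once enter() and push_CPU_state() below have run, the save area
       // matches the layout enum (from low to high addresses): the fxsave/xsave
       // image, the pushed general purpose registers and flags, the rbp saved by
       // enter(), and finally the return address pushed by the caller. The
       // argument register save area, if any, is allocated below all of this
       // further down.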
 204 
 205   __ enter();          // rsp becomes 16-byte aligned here
 206   __ push_CPU_state(); // Push a multiple of 16 bytes
 207 
 208   // push_CPU_state handles this on EVEX enabled targets
 209   if (save_vectors) {
 210     // Save upper half of YMM registers(0..15)
 211     int base_addr = XSAVE_AREA_YMM_BEGIN;
 212     for (int n = 0; n < 16; n++) {
 213       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 214     }
 215     if (VM_Version::supports_evex()) {
 216       // Save upper half of ZMM registers(0..15)
 217       base_addr = XSAVE_AREA_ZMM_BEGIN;
 218       for (int n = 0; n < 16; n++) {
 219         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 220       }
 221       // Save full ZMM registers(16..num_xmm_regs)
 222       base_addr = XSAVE_AREA_UPPERBANK;
 223       off = 0;
 224       int vector_len = Assembler::AVX_512bit;
 225       for (int n = 16; n < num_xmm_regs; n++) {
 226         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 227       }
 228 #if COMPILER2_OR_JVMCI
 229       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 230       off = 0;
 231       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 232         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 233       }
 234 #endif
 235     }
 236   } else {
 237     if (VM_Version::supports_evex()) {
 238       // Save upper bank of ZMM registers(16..31) for double/float usage
 239       int base_addr = XSAVE_AREA_UPPERBANK;
 240       off = 0;
 241       for (int n = 16; n < num_xmm_regs; n++) {
 242         __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
 243       }
 244 #if COMPILER2_OR_JVMCI
 245       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 246       off = 0;
 247       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 248         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 249       }
 250 #endif
 251     }
 252   }
 253   __ vzeroupper();
 254   if (frame::arg_reg_save_area_bytes != 0) {
 255     // Allocate argument register save area
 256     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 257   }
 258 
 259   // Set an oopmap for the call site.  This oopmap will map all
 260   // oop-registers and debug-info registers as callee-saved.  This
 261   // will allow deoptimization at this safepoint to find all possible
 262   // debug-info recordings, as well as let GC find all oops.
 263 
 264   OopMapSet *oop_maps = new OopMapSet();
 265   OopMap* map = new OopMap(frame_size_in_slots, 0);
 266 
 267 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
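       // STACK_OFFSET turns one of the layout enum slot indices above into a VMReg
       // stack location, the form in which OopMap::set_callee_saved() records where
       // each register was saved in this frame.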
 268 
 269   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 271   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 272   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 273   // rbp location is known implicitly by the frame sender code, needs no oopmap
 274   // and the location where rbp was saved is ignored
 275   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 283   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 284   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 285   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 286   // on EVEX enabled targets, we get it included in the xsave area
 287   off = xmm0_off;
 288   int delta = xmm1_off - off;
 289   for (int n = 0; n < 16; n++) {
 290     XMMRegister xmm_name = as_XMMRegister(n);
 291     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 292     off += delta;
 293   }
 294   if (UseAVX > 2) {
 295     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 296     off = zmm16_off;
 297     delta = zmm17_off - off;
 298     for (int n = 16; n < num_xmm_regs; n++) {
 299       XMMRegister zmm_name = as_XMMRegister(n);
 300       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 301       off += delta;
 302     }
 303   }
 304 
 305 #if COMPILER2_OR_JVMCI
 306   if (save_vectors) {
 307     // Save upper half of YMM registers(0..15)
 308     off = ymm0_off;
 309     delta = ymm1_off - ymm0_off;
 310     for (int n = 0; n < 16; n++) {
 311       XMMRegister ymm_name = as_XMMRegister(n);
 312       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 313       off += delta;
 314     }
 315     if (VM_Version::supports_evex()) {
 316       // Save upper half of ZMM registers(0..15)
 317       off = zmm0_off;
 318       delta = zmm1_off - zmm0_off;
 319       for (int n = 0; n < 16; n++) {
 320         XMMRegister zmm_name = as_XMMRegister(n);
 321         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 322         off += delta;
 323       }
 324     }
 325   }
 326 #endif // COMPILER2_OR_JVMCI
 327 
 328   // %%% These should all be a waste but we'll keep things as they were for now
 329   if (true) {
 330     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 333     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 334     // rbp location is known implicitly by the frame sender code, needs no oopmap
 335     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 343     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 344     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 345     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 346     // on EVEX enabled targets, we get it included in the xsave area
 347     off = xmm0H_off;
 348     delta = xmm1H_off - off;
 349     for (int n = 0; n < 16; n++) {
 350       XMMRegister xmm_name = as_XMMRegister(n);
 351       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 352       off += delta;
 353     }
 354     if (UseAVX > 2) {
 355       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 356       off = zmm16H_off;
 357       delta = zmm17H_off - off;
 358       for (int n = 16; n < num_xmm_regs; n++) {
 359         XMMRegister zmm_name = as_XMMRegister(n);
 360         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 361         off += delta;
 362       }
 363     }
 364   }
 365 
 366   return map;
 367 }
 368 PRAGMA_DIAG_POP
 369 
 370 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
 371   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 372   if (UseAVX < 3) {
 373     num_xmm_regs = num_xmm_regs/2;
 374   }
 375   if (frame::arg_reg_save_area_bytes != 0) {
 376     // Pop arg register save area
 377     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 378   }
 379 
 380 #if COMPILER2_OR_JVMCI
 381   if (restore_vectors) {
 382     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 383     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 384   }
 385 #else
 386   assert(!restore_vectors, "vectors are generated only by C2");
 387 #endif
 388 
 389   __ vzeroupper();
 390 
 391   // On EVEX enabled targets everything is handled in pop fpu state
 392   if (restore_vectors) {
 393     // Restore upper half of YMM registers (0..15)
 394     int base_addr = XSAVE_AREA_YMM_BEGIN;
 395     for (int n = 0; n < 16; n++) {
 396       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 397     }
 398     if (VM_Version::supports_evex()) {
 399       // Restore upper half of ZMM registers (0..15)
 400       base_addr = XSAVE_AREA_ZMM_BEGIN;
 401       for (int n = 0; n < 16; n++) {
 402         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 403       }
 404       // Restore full ZMM registers(16..num_xmm_regs)
 405       base_addr = XSAVE_AREA_UPPERBANK;
 406       int vector_len = Assembler::AVX_512bit;
 407       int off = 0;
 408       for (int n = 16; n < num_xmm_regs; n++) {
 409         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 410       }
 411 #if COMPILER2_OR_JVMCI
 412       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 413       off = 0;
 414       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 415         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 416       }
 417 #endif
 418     }
 419   } else {
 420     if (VM_Version::supports_evex()) {
 421       // Restore upper bank of ZMM registers(16..31) for double/float usage
 422       int base_addr = XSAVE_AREA_UPPERBANK;
 423       int off = 0;
 424       for (int n = 16; n < num_xmm_regs; n++) {
 425         __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
 426       }
 427 #if COMPILER2_OR_JVMCI
 428       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 429       off = 0;
 430       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 431         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 432       }
 433 #endif
 434     }
 435   }
 436 
 437   // Recover CPU state
 438   __ pop_CPU_state();
 439   // Get the rbp described implicitly by the calling convention (no oopMap)
 440   __ pop(rbp);
 441 }
 442 
 443 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 444 
 445   // Just restore the result registers. Only used by deoptimization. By
 446   // now any callee save register that needs to be restored to a c2
 447   // caller of the deoptee has been extracted into the vframeArray
 448   // and will be stuffed into the c2i adapter we create for later
 449   // restoration so only result registers need to be restored here.
 450 
 451   // Restore fp result register
 452   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 453   // Restore integer result register
 454   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 455   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 456 
 457   // Pop all of the register save area off the stack except the return address
 458   __ addptr(rsp, return_offset_in_bytes());
 459 }
 460 
 461 // Is the vector's size (in bytes) bigger than the size saved by default?
 462 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 463 bool SharedRuntime::is_wide_vector(int size) {
 464   return size > 16;
 465 }
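     // So 32-byte YMM and 64-byte ZMM vectors count as wide and need the extra
     // handling done in save_live_registers()/restore_live_registers() above.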
 466 
 467 // ---------------------------------------------------------------------------
 468 // Read the array of BasicTypes from a signature, and compute where the
 469 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 470 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 471 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 472 // as framesizes are fixed.
 473 // VMRegImpl::stack0 refers to the first slot 0(sp),
 474 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Registers
 475 // (up to RegisterImpl::number_of_registers) are the 64-bit
 476 // integer registers.
 477 
 478 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 479 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 480 // units regardless of build. Of course for i486 there is no 64 bit build
 481 
 482 // The Java calling convention is a "shifted" version of the C ABI.
 483 // By skipping the first C ABI register we can call non-static jni methods
 484 // with small numbers of arguments without having to shuffle the arguments
 485 // at all. Since we control the java ABI we ought to at least get some
 486 // advantage out of it.
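     // As a sketch of the mapping below (assuming the register arguments do not
     // overflow onto the stack): for a signature (int, long, Object, float) the
     // loop hands out j_rarg0 for the int, j_rarg1 for the long (whose T_VOID
     // half gets no register), j_rarg2 for the Object and j_farg0 for the float,
     // leaving stk_args at 0.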
 487 
 488 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 489                                            VMRegPair *regs,
 490                                            int total_args_passed) {
 491 
 492   // Create the mapping between argument positions and
 493   // registers.
 494   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 495     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 496   };
 497   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 498     j_farg0, j_farg1, j_farg2, j_farg3,
 499     j_farg4, j_farg5, j_farg6, j_farg7
 500   };
 501 
 502 
 503   uint int_args = 0;
 504   uint fp_args = 0;
 505   uint stk_args = 0; // inc by 2 each time
 506 
 507   for (int i = 0; i < total_args_passed; i++) {
 508     switch (sig_bt[i]) {
 509     case T_BOOLEAN:
 510     case T_CHAR:
 511     case T_BYTE:
 512     case T_SHORT:
 513     case T_INT:
 514       if (int_args < Argument::n_int_register_parameters_j) {
 515         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 516       } else {
 517         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 518         stk_args += 2;
 519       }
 520       break;
 521     case T_VOID:
 522       // halves of T_LONG or T_DOUBLE
 523       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 524       regs[i].set_bad();
 525       break;
 526     case T_LONG:
 527       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 528       // fall through
 529     case T_OBJECT:
 530     case T_ARRAY:
 531     case T_ADDRESS:
 532     case T_PRIMITIVE_OBJECT:
 533       if (int_args < Argument::n_int_register_parameters_j) {
 534         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 535       } else {
 536         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 537         stk_args += 2;
 538       }
 539       break;
 540     case T_FLOAT:
 541       if (fp_args < Argument::n_float_register_parameters_j) {
 542         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 543       } else {
 544         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 545         stk_args += 2;
 546       }
 547       break;
 548     case T_DOUBLE:
 549       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 550       if (fp_args < Argument::n_float_register_parameters_j) {
 551         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 552       } else {
 553         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 554         stk_args += 2;
 555       }
 556       break;
 557     default:
 558       ShouldNotReachHere();
 559       break;
 560     }
 561   }
 562 
 563   return align_up(stk_args, 2);
 564 }
 565 
 566 // Same as java_calling_convention() but for multiple return
 567 // values. There's no way to store them on the stack so if we don't
 568 // have enough registers, multiple values can't be returned.
 569 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 570 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
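     // Note the "+1": unlike the argument convention above, rax is also available
     // for returns, so up to seven integer/oop values can be returned in registers
     // (see INT_ArgReg below).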
 571 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 572                                           VMRegPair *regs,
 573                                           int total_args_passed) {
 574   // Create the mapping between argument positions and
 575   // registers.
 576   static const Register INT_ArgReg[java_return_convention_max_int] = {
 577     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 578   };
 579   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 580     j_farg0, j_farg1, j_farg2, j_farg3,
 581     j_farg4, j_farg5, j_farg6, j_farg7
 582   };
 583 
 584 
 585   uint int_args = 0;
 586   uint fp_args = 0;
 587 
 588   for (int i = 0; i < total_args_passed; i++) {
 589     switch (sig_bt[i]) {
 590     case T_BOOLEAN:
 591     case T_CHAR:
 592     case T_BYTE:
 593     case T_SHORT:
 594     case T_INT:
 595       if (int_args < Argument::n_int_register_parameters_j+1) {
 596         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 597         int_args++;
 598       } else {
 599         return -1;
 600       }
 601       break;
 602     case T_VOID:
 603       // halves of T_LONG or T_DOUBLE
 604       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 605       regs[i].set_bad();
 606       break;
 607     case T_LONG:
 608       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 609       // fall through
 610     case T_OBJECT:
 611     case T_PRIMITIVE_OBJECT:
 612     case T_ARRAY:
 613     case T_ADDRESS:
 614     case T_METADATA:
 615       if (int_args < Argument::n_int_register_parameters_j+1) {
 616         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 617         int_args++;
 618       } else {
 619         return -1;
 620       }
 621       break;
 622     case T_FLOAT:
 623       if (fp_args < Argument::n_float_register_parameters_j) {
 624         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 625         fp_args++;
 626       } else {
 627         return -1;
 628       }
 629       break;
 630     case T_DOUBLE:
 631       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 632       if (fp_args < Argument::n_float_register_parameters_j) {
 633         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 634         fp_args++;
 635       } else {
 636         return -1;
 637       }
 638       break;
 639     default:
 640       ShouldNotReachHere();
 641       break;
 642     }
 643   }
 644 
 645   return int_args + fp_args;
 646 }
 647 
 648 // Patch the caller's callsite with entry to compiled code if it exists.
 649 static void patch_callers_callsite(MacroAssembler *masm) {
 650   Label L;
 651   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
 652   __ jcc(Assembler::equal, L);
 653 
 654   // Save the current stack pointer
 655   __ mov(r13, rsp);
 656   // Schedule the branch target address early.
 657   // Call into the VM to patch the caller, then jump to compiled callee
 658   // rax isn't live so capture return address while we easily can
 659   __ movptr(rax, Address(rsp, 0));
 660 
 661   // align stack so push_CPU_state doesn't fault
 662   __ andptr(rsp, -(StackAlignmentInBytes));
 663   __ push_CPU_state();
 664   __ vzeroupper();
 665   // VM needs caller's callsite
 666   // VM needs target method
 667   // This needs to be a long call since we will relocate this adapter to
 668   // the codeBuffer and it may not reach
 669 
 670   // Allocate argument register save area
 671   if (frame::arg_reg_save_area_bytes != 0) {
 672     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 673   }
 674   __ mov(c_rarg0, rbx);
 675   __ mov(c_rarg1, rax);
 676   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 677 
 678   // De-allocate argument register save area
 679   if (frame::arg_reg_save_area_bytes != 0) {
 680     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 681   }
 682 
 683   __ vzeroupper();
 684   __ pop_CPU_state();
 685   // restore sp
 686   __ mov(rsp, r13);
 687   __ bind(L);
 688 }
 689 
 690 // For each inline type argument, sig includes the list of fields of
 691 // the inline type. This utility function computes the number of
 692 // arguments for the call if inline types are passed by reference (the
 693 // calling convention the interpreter expects).
 694 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 695   int total_args_passed = 0;
 696   if (InlineTypePassFieldsAsArgs) {
 697     for (int i = 0; i < sig_extended->length(); i++) {
 698       BasicType bt = sig_extended->at(i)._bt;
 699       if (bt == T_PRIMITIVE_OBJECT) {
 700         // In sig_extended, an inline type argument starts with:
 701         // T_PRIMITIVE_OBJECT, followed by the types of the fields of the
 702         // inline type and T_VOID to mark the end of the value
 703         // type. Inline types are flattened so, for instance, in the
 704         // case of an inline type with an int field and an inline type
 705         // field that itself has 2 fields, an int and a long:
 706         // T_PRIMITIVE_OBJECT T_INT T_PRIMITIVE_OBJECT T_INT T_LONG T_VOID (second
 707         // slot for the T_LONG) T_VOID (inner T_PRIMITIVE_OBJECT) T_VOID
 708         // (outer T_PRIMITIVE_OBJECT)
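             // For that example the whole sequence above counts as a single
             // interpreter argument: total_args_passed is incremented once here
             // and the do/while loop below consumes the remaining field and
             // T_VOID entries.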
 709         total_args_passed++;
 710         int vt = 1;
 711         do {
 712           i++;
 713           BasicType bt = sig_extended->at(i)._bt;
 714           BasicType prev_bt = sig_extended->at(i-1)._bt;
 715           if (bt == T_PRIMITIVE_OBJECT) {
 716             vt++;
 717           } else if (bt == T_VOID &&
 718                      prev_bt != T_LONG &&
 719                      prev_bt != T_DOUBLE) {
 720             vt--;
 721           }
 722         } while (vt != 0);
 723       } else {
 724         total_args_passed++;
 725       }
 726     }
 727   } else {
 728     total_args_passed = sig_extended->length();
 729   }
 730   return total_args_passed;
 731 }
 732 
 733 
 734 static void gen_c2i_adapter_helper(MacroAssembler* masm,
 735                                    BasicType bt,
 736                                    BasicType prev_bt,
 737                                    size_t size_in_bytes,
 738                                    const VMRegPair& reg_pair,
 739                                    const Address& to,
 740                                    int extraspace,
 741                                    bool is_oop) {
 742   assert(bt != T_PRIMITIVE_OBJECT || !InlineTypePassFieldsAsArgs, "no inline type here");
 743   if (bt == T_VOID) {
 744     assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
 745     return;
 746   }
 747 
 748   // Say 4 args:
 749   // i   st_off
 750   // 0   32 T_LONG
 751   // 1   24 T_VOID
 752   // 2   16 T_OBJECT
 753   // 3    8 T_BOOL
 754   // -    0 return address
 755   //
 756   // However, to make things extra confusing: because we can fit a long/double in
 757   // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 758   // leaves one slot empty and only stores to a single slot. In this case the
 759   // slot that is occupied is the T_VOID slot. See, I said it was confusing.
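       // Concretely, with the layout sketched above the caller (gen_c2i_adapter)
       // passes the T_VOID slot (offset 24) rather than st_off (32) as the
       // destination for the T_LONG value, and in debug builds it fills the
       // unused slot with known junk.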
 760 
 761   bool wide = (size_in_bytes == wordSize);
 762   VMReg r_1 = reg_pair.first();
 763   VMReg r_2 = reg_pair.second();
 764   assert(r_2->is_valid() == wide, "invalid size");
 765   if (!r_1->is_valid()) {
 766     assert(!r_2->is_valid(), "must be invalid");
 767     return;
 768   }
 769 
 770   if (!r_1->is_XMMRegister()) {
 771     Register val = rax;
 772     if (r_1->is_stack()) {
 773       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 774       __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
 775     } else {
 776       val = r_1->as_Register();
 777     }
 778     assert_different_registers(to.base(), val, rscratch1);
 779     if (is_oop) {
 780       __ push(r13);
 781       __ push(rbx);
 782       __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
 783       __ pop(rbx);
 784       __ pop(r13);
 785     } else {
 786       __ store_sized_value(to, val, size_in_bytes);
 787     }
 788   } else {
 789     if (wide) {
 790       __ movdbl(to, r_1->as_XMMRegister());
 791     } else {
 792       __ movflt(to, r_1->as_XMMRegister());
 793     }
 794   }
 795 }
 796 
 797 static void gen_c2i_adapter(MacroAssembler *masm,
 798                             const GrowableArray<SigEntry>* sig_extended,
 799                             const VMRegPair *regs,
 800                             Label& skip_fixup,
 801                             address start,
 802                             OopMapSet* oop_maps,
 803                             int& frame_complete,
 804                             int& frame_size_in_words,
 805                             bool alloc_inline_receiver) {
 806   // Before we get into the guts of the C2I adapter, see if we should be here
 807   // at all.  We've come from compiled code and are attempting to jump to the
 808   // interpreter, which means the caller made a static call to get here
 809   // (vcalls always get a compiled target if there is one).  Check for a
 810   // compiled target.  If there is one, we need to patch the caller's call.
 811   patch_callers_callsite(masm);
 812 
 813   __ bind(skip_fixup);
 814 
 815   if (InlineTypePassFieldsAsArgs) {
 816     // Is there an inline type argument?
 817     bool has_inline_argument = false;
 818     for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
 819       has_inline_argument = (sig_extended->at(i)._bt == T_PRIMITIVE_OBJECT);
 820     }
 821     if (has_inline_argument) {
 822       // There is at least an inline type argument: we're coming from
 823       // compiled code so we have no buffers to back the inline types.
 824       // Allocate the buffers here with a runtime call.
 825       OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
 826 
 827       frame_complete = __ offset();
 828 
 829       __ set_last_Java_frame(noreg, noreg, NULL);
 830 
 831       __ mov(c_rarg0, r15_thread);
 832       __ mov(c_rarg1, rbx);
 833       __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
 834       __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
 835 
 836       oop_maps->add_gc_map((int)(__ pc() - start), map);
 837       __ reset_last_Java_frame(false);
 838 
 839       RegisterSaver::restore_live_registers(masm);
 840 
 841       Label no_exception;
 842       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 843       __ jcc(Assembler::equal, no_exception);
 844 
 845       __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
 846       __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 847       __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 848 
 849       __ bind(no_exception);
 850 
 851       // We get an array of objects from the runtime call
 852       __ get_vm_result(rscratch2, r15_thread); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
 853       __ get_vm_result_2(rbx, r15_thread); // TODO: required to keep the callee Method live?
 854     }
 855   }
 856 
 857   // Since all args are passed on the stack, total_args_passed *
 858   // Interpreter::stackElementSize is the space we need. Plus one word because
 859   // we also account for the return address location since
 860   // we store it first rather than hold it in rax across all the shuffling.
 861   int total_args_passed = compute_total_args_passed_int(sig_extended);
 862   int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
 863 
 864   // stack is aligned, keep it that way
 865   extraspace = align_up(extraspace, 2*wordSize);
 866 
 867   // Get return address
 868   __ pop(rax);
 869 
 870   // set senderSP value
 871   __ mov(r13, rsp);
 872 
 873   __ subptr(rsp, extraspace);
 874 
 875   // Store the return address in the expected location
 876   __ movptr(Address(rsp, 0), rax);
 877 
 878   // Now write the args into the outgoing interpreter space
 879 
 880   // next_arg_comp is the next argument from the compiler point of
 881   // view (inline type fields are passed in registers/on the stack). In
 882   // sig_extended, an inline type argument starts with: T_PRIMITIVE_OBJECT,
 883   // followed by the types of the fields of the inline type and T_VOID
 884   // to mark the end of the inline type. ignored counts the number of
 885   // T_PRIMITIVE_OBJECT/T_VOID. next_vt_arg is the next inline type argument:
 886   // used to get the buffer for that argument from the pool of buffers
 887   // we allocated above and want to pass to the
 888   // interpreter. next_arg_int is the next argument from the
 889   // interpreter point of view (inline types are passed by reference).
 890   for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
 891        next_arg_comp < sig_extended->length(); next_arg_comp++) {
 892     assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
 893     assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
 894     BasicType bt = sig_extended->at(next_arg_comp)._bt;
 895     int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
 896     if (!InlineTypePassFieldsAsArgs || bt != T_PRIMITIVE_OBJECT) {
 897       int next_off = st_off - Interpreter::stackElementSize;
 898       const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
 899       const VMRegPair reg_pair = regs[next_arg_comp-ignored];
 900       size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
 901       gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 902                              size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
 903       next_arg_int++;
 904 #ifdef ASSERT
 905       if (bt == T_LONG || bt == T_DOUBLE) {
 906         // Overwrite the unused slot with known junk
 907         __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 908         __ movptr(Address(rsp, st_off), rax);
 909       }
 910 #endif /* ASSERT */
 911     } else {
 912       ignored++;
 913       // get the buffer from the just allocated pool of buffers
 914       int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_PRIMITIVE_OBJECT);
 915       __ load_heap_oop(r14, Address(rscratch2, index));
 916       next_vt_arg++; next_arg_int++;
 917       int vt = 1;
 918       // write fields we get from compiled code in registers/stack
 919       // slots to the buffer: we know we are done with that inline type
 920       // argument when we hit the T_VOID that acts as an end of inline
 921       // type delimiter for this inline type. Inline types are flattened
 922       // so we might encounter embedded inline types. Each entry in
 923       // sig_extended contains a field offset in the buffer.
 924       Label L_null;
 925       do {
 926         next_arg_comp++;
 927         BasicType bt = sig_extended->at(next_arg_comp)._bt;
 928         BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
 929         if (bt == T_PRIMITIVE_OBJECT) {
 930           vt++;
 931           ignored++;
 932         } else if (bt == T_VOID &&
 933                    prev_bt != T_LONG &&
 934                    prev_bt != T_DOUBLE) {
 935           vt--;
 936           ignored++;
 937         } else {
 938           int off = sig_extended->at(next_arg_comp)._offset;
 939           if (off == -1) {
 940             // Nullable inline type argument, emit null check
 941             VMReg reg = regs[next_arg_comp-ignored].first();
 942             Label L_notNull;
 943             if (reg->is_stack()) {
 944               int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 945               __ testb(Address(rsp, ld_off), 1);
 946             } else {
 947               __ testb(reg->as_Register(), 1);
 948             }
 949             __ jcc(Assembler::notZero, L_notNull);
 950             __ movptr(Address(rsp, st_off), 0);
 951             __ jmp(L_null);
 952             __ bind(L_notNull);
 953             continue;
 954           }
 955           assert(off > 0, "offset in object should be positive");
 956           size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
 957           bool is_oop = is_reference_type(bt);
 958           gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 959                                  size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
 960         }
 961       } while (vt != 0);
 962       // pass the buffer to the interpreter
 963       __ movptr(Address(rsp, st_off), r14);
 964       __ bind(L_null);
 965     }
 966   }
 967 
 968   // Schedule the branch target address early.
 969   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 970   __ jmp(rcx);
 971 }
 972 
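     // Emits a range check on pc_reg: branches to L_ok when
     // code_start < pc_reg < code_end and otherwise falls through (L_fail is only
     // an internal label). Used below to verify that an i2c adapter is returning
     // to interpreter or stub code.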
 973 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 974                         address code_start, address code_end,
 975                         Label& L_ok) {
 976   Label L_fail;
 977   __ lea(temp_reg, ExternalAddress(code_start));
 978   __ cmpptr(pc_reg, temp_reg);
 979   __ jcc(Assembler::belowEqual, L_fail);
 980   __ lea(temp_reg, ExternalAddress(code_end));
 981   __ cmpptr(pc_reg, temp_reg);
 982   __ jcc(Assembler::below, L_ok);
 983   __ bind(L_fail);
 984 }
 985 
 986 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 987                                     int comp_args_on_stack,
 988                                     const GrowableArray<SigEntry>* sig,
 989                                     const VMRegPair *regs) {
 990 
 991   // Note: r13 contains the senderSP on entry. We must preserve it since
 992   // we may do an i2c -> c2i transition if we lose a race where compiled
 993   // code goes non-entrant while we get args ready.
 994   // In addition we use r13 to locate all the interpreter args, as
 995   // we must align the stack to 16 bytes on an i2c entry, else we
 996   // lose the alignment we expect in all compiled code and the register
 997   // save code can segv when fxsave instructions find an improperly
 998   // aligned stack pointer.
 999 
1000   // Adapters can be frameless because they do not require the caller
1001   // to perform additional cleanup work, such as correcting the stack pointer.
1002   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
1003   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
1004   // even if a callee has modified the stack pointer.
1005   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
1006   // routinely repairs its caller's stack pointer (from sender_sp, which is set
1007   // up via the senderSP register).
1008   // In other words, if *either* the caller or callee is interpreted, we can
1009   // get the stack pointer repaired after a call.
1010   // This is why c2i and i2c adapters cannot be indefinitely composed.
1011   // In particular, if a c2i adapter were to somehow call an i2c adapter,
1012   // both caller and callee would be compiled methods, and neither would
1013   // clean up the stack pointer changes performed by the two adapters.
1014   // If this happens, control eventually transfers back to the compiled
1015   // caller, but with an uncorrected stack, causing delayed havoc.
1016 
1017   // Pick up the return address
1018   __ movptr(rax, Address(rsp, 0));
1019 
1020   if (VerifyAdapterCalls &&
1021       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
1022     // So, let's test for cascading c2i/i2c adapters right now.
1023     //  assert(Interpreter::contains($return_addr) ||
1024     //         StubRoutines::contains($return_addr),
1025     //         "i2c adapter must return to an interpreter frame");
1026     __ block_comment("verify_i2c { ");
1027     Label L_ok;
1028     if (Interpreter::code() != NULL)
1029       range_check(masm, rax, r11,
1030                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
1031                   L_ok);
1032     if (StubRoutines::code1() != NULL)
1033       range_check(masm, rax, r11,
1034                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
1035                   L_ok);
1036     if (StubRoutines::code2() != NULL)
1037       range_check(masm, rax, r11,
1038                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
1039                   L_ok);
1040     const char* msg = "i2c adapter must return to an interpreter frame";
1041     __ block_comment(msg);
1042     __ stop(msg);
1043     __ bind(L_ok);
1044     __ block_comment("} verify_i2ce ");
1045   }
1046 
1047   // Must preserve original SP for loading incoming arguments because
1048   // we need to align the outgoing SP for compiled code.
1049   __ movptr(r11, rsp);
1050 
1051   // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
1052   // in registers, we will occasionally have no stack args.
1053   int comp_words_on_stack = 0;
1054   if (comp_args_on_stack) {
1055     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
1056     // registers are below.  By subtracting stack0, we either get a negative
1057     // number (all values in registers) or the maximum stack slot accessed.
1058 
1059     // Convert 4-byte c2 stack slots to words.
1060     comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
1061     // Round up to minimum stack alignment, in wordSize
1062     comp_words_on_stack = align_up(comp_words_on_stack, 2);
1063     __ subptr(rsp, comp_words_on_stack * wordSize);
1064   }
1065 
1066 
1067   // Ensure compiled code always sees stack at proper alignment
1068   __ andptr(rsp, -16);
1069 
1070   // push the return address and misalign the stack so that the youngest frame always sees
1071   // the stack as it would appear right after the placement of the call instruction
1072   __ push(rax);
1073 
1074   // Put saved SP in another register
1075   const Register saved_sp = rax;
1076   __ movptr(saved_sp, r11);
1077 
1078   // Will jump to the compiled code just as if compiled code was doing it.
1079   // Pre-load the register-jump target early, to schedule it better.
1080   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1081 
1082 #if INCLUDE_JVMCI
1083   if (EnableJVMCI) {
1084     // check if this call should be routed towards a specific entry point
1085     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1086     Label no_alternative_target;
1087     __ jcc(Assembler::equal, no_alternative_target);
1088     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1089     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1090     __ bind(no_alternative_target);
1091   }
1092 #endif // INCLUDE_JVMCI
1093 
1094   int total_args_passed = sig->length();
1095 
1096   // Now generate the shuffle code.  Pick up all register args and move the
1097   // rest through the floating point stack top.
1098   for (int i = 0; i < total_args_passed; i++) {
1099     BasicType bt = sig->at(i)._bt;
1100     assert(bt != T_PRIMITIVE_OBJECT, "i2c adapter doesn't unpack inline type args");
1101     if (bt == T_VOID) {
1102       // Longs and doubles are passed in native word order, but misaligned
1103       // in the 32-bit build.
1104       BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1105       assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1106       continue;
1107     }
1108 
1109     // Pick up 0, 1 or 2 words from SP+offset.
1110 
1111     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1112             "scrambled load targets?");
1113     // Load in argument order going down.
1114     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1115     // Point to interpreter value (vs. tag)
1116     int next_off = ld_off - Interpreter::stackElementSize;
1117     //
1118     //
1119     //
1120     VMReg r_1 = regs[i].first();
1121     VMReg r_2 = regs[i].second();
1122     if (!r_1->is_valid()) {
1123       assert(!r_2->is_valid(), "");
1124       continue;
1125     }
1126     if (r_1->is_stack()) {
1127       // Convert stack slot to an SP offset (+ wordSize to account for return address )
1128       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1129 
1130       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
1131       // and if we end up going through a c2i because of a miss, a reasonable value of r13
1132       // will be generated.
1133       if (!r_2->is_valid()) {
1134         // sign extend???
1135         __ movl(r13, Address(saved_sp, ld_off));
1136         __ movptr(Address(rsp, st_off), r13);
1137       } else {
1138         //
1139         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
1140         // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
1141         // so we must adjust where to pick up the data to match the interpreter.
1142         //
1143         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
1144         // are accessed as negative so LSW is at LOW address
1145 
1146         // ld_off is MSW so get LSW
1147         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1148                            next_off : ld_off;
1149         __ movq(r13, Address(saved_sp, offset));
1150         // st_off is LSW (i.e. reg.first())
1151         __ movq(Address(rsp, st_off), r13);
1152       }
1153     } else if (r_1->is_Register()) {  // Register argument
1154       Register r = r_1->as_Register();
1155       assert(r != rax, "must be different");
1156       if (r_2->is_valid()) {
1157         //
1158         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
1159         // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
1160         // so we must adjust where to pick up the data to match the interpreter.
1161 
1162         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1163                            next_off : ld_off;
1164 
1165         // this can be a misaligned move
1166         __ movq(r, Address(saved_sp, offset));
1167       } else {
1168         // sign extend and use a full word?
1169         __ movl(r, Address(saved_sp, ld_off));
1170       }
1171     } else {
1172       if (!r_2->is_valid()) {
1173         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1174       } else {
1175         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1176       }
1177     }
1178   }
1179 
1180   // 6243940 We might end up in handle_wrong_method if
1181   // the callee is deoptimized as we race thru here. If that
1182   // happens we don't want to take a safepoint because the
1183   // caller frame will look interpreted and arguments are now
1184   // "compiled" so it is much better to make this transition
1185   // invisible to the stack walking code. Unfortunately if
1186   // we try and find the callee by normal means a safepoint
1187   // is possible. So we stash the desired callee in the thread
1188   // and the VM will find it there should this case occur.
1189 
1190   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1191 
1192   // put Method* where a c2i would expect it, should we end up there;
1193   // only needed because c2 resolve stubs return Method* as a result in
1194   // rax
1195   __ mov(rax, rbx);
1196   __ jmp(r11);
1197 }
1198 
1199 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1200   Label ok;
1201 
1202   Register holder = rax;
1203   Register receiver = j_rarg0;
1204   Register temp = rbx;
1205 
1206   __ load_klass(temp, receiver, rscratch1);
1207   __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
1208   __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
1209   __ jcc(Assembler::equal, ok);
1210   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1211 
1212   __ bind(ok);
1213   // Method might have been compiled since the call site was patched to
1214   // interpreted; if that is the case, treat it as a miss so we can get
1215   // the call site corrected.
1216   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
1217   __ jcc(Assembler::equal, skip_fixup);
1218   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1219 }
1220 
1221 // ---------------------------------------------------------------
1222 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1223                                                             int comp_args_on_stack,
1224                                                             const GrowableArray<SigEntry>* sig,
1225                                                             const VMRegPair* regs,
1226                                                             const GrowableArray<SigEntry>* sig_cc,
1227                                                             const VMRegPair* regs_cc,
1228                                                             const GrowableArray<SigEntry>* sig_cc_ro,
1229                                                             const VMRegPair* regs_cc_ro,
1230                                                             AdapterFingerPrint* fingerprint,
1231                                                             AdapterBlob*& new_adapter,
1232                                                             bool allocate_code_blob) {
1233   address i2c_entry = __ pc();
1234   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1235 
1236   // -------------------------------------------------------------------------
1237   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1238   // to the interpreter.  The args start out packed in the compiled layout.  They
1239   // need to be unpacked into the interpreter layout.  This will almost always
1240   // require some stack space.  We grow the current (compiled) stack, then repack
1241   // the args.  We  finally end in a jump to the generic interpreter entry point.
1242   // On exit from the interpreter, the interpreter will restore our SP (lest the
1243   // compiled code, which relies solely on SP and not RBP, get sick).
1244 
1245   address c2i_unverified_entry = __ pc();
1246   Label skip_fixup;
1247 
1248   gen_inline_cache_check(masm, skip_fixup);
1249 
1250   OopMapSet* oop_maps = new OopMapSet();
1251   int frame_complete = CodeOffsets::frame_never_safe;
1252   int frame_size_in_words = 0;
1253 
1254   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1255   address c2i_inline_ro_entry = __ pc();
1256   if (regs_cc != regs_cc_ro) {
1257     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, false);
1258     skip_fixup.reset();
1259   }
1260 
1261   // Scalarized c2i adapter
1262   address c2i_entry = __ pc();
1263 
1264   // Class initialization barrier for static methods
1265   address c2i_no_clinit_check_entry = NULL;
1266   if (VM_Version::supports_fast_class_init_checks()) {
1267     Label L_skip_barrier;
1268     Register method = rbx;
1269 
1270     { // Bypass the barrier for non-static methods
1271       Register flags  = rscratch1;
1272       __ movl(flags, Address(method, Method::access_flags_offset()));
1273       __ testl(flags, JVM_ACC_STATIC);
1274       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1275     }
1276 
1277     Register klass = rscratch1;
1278     __ load_method_holder(klass, method);
1279     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1280 
1281     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1282 
1283     __ bind(L_skip_barrier);
1284     c2i_no_clinit_check_entry = __ pc();
1285   }
1286 
1287   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1288   bs->c2i_entry_barrier(masm);
1289 
1290   gen_c2i_adapter(masm, sig_cc, regs_cc, skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, true);
1291 
1292   address c2i_unverified_inline_entry = c2i_unverified_entry;
1293 
1294   // Non-scalarized c2i adapter
1295   address c2i_inline_entry = c2i_entry;
1296   if (regs != regs_cc) {
1297     Label inline_entry_skip_fixup;
1298     c2i_unverified_inline_entry = __ pc();
1299     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1300 
1301     c2i_inline_entry = __ pc();
1302     gen_c2i_adapter(masm, sig, regs, inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, false);
1303   }
1304 
1305   __ flush();
1306 
1307   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1308   // the GC knows about the location of oop argument locations passed to the c2i adapter.
1309   if (allocate_code_blob) {
1310     bool caller_must_gc_arguments = (regs != regs_cc);
1311     new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1312   }
1313 
1314   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry, c2i_unverified_inline_entry, c2i_no_clinit_check_entry);
1315 }
1316 
1317 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1318                                          VMRegPair *regs,
1319                                          VMRegPair *regs2,
1320                                          int total_args_passed) {
1321   assert(regs2 == NULL, "not needed on x86");
1322 // We return the number of VMRegImpl stack slots we need to reserve for all
1323 // the arguments NOT counting out_preserve_stack_slots.
1324 
1325 // NOTE: These arrays will have to change when c1 is ported
1326 #ifdef _WIN64
1327     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1328       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1329     };
1330     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1331       c_farg0, c_farg1, c_farg2, c_farg3
1332     };
1333 #else
1334     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1335       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1336     };
1337     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1338       c_farg0, c_farg1, c_farg2, c_farg3,
1339       c_farg4, c_farg5, c_farg6, c_farg7
1340     };
1341 #endif // _WIN64
1342 
1343 
1344     uint int_args = 0;
1345     uint fp_args = 0;
1346     uint stk_args = 0; // inc by 2 each time
1347 
1348     for (int i = 0; i < total_args_passed; i++) {
1349       switch (sig_bt[i]) {
1350       case T_BOOLEAN:
1351       case T_CHAR:
1352       case T_BYTE:
1353       case T_SHORT:
1354       case T_INT:
1355         if (int_args < Argument::n_int_register_parameters_c) {
1356           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1357 #ifdef _WIN64
1358           fp_args++;
1359           // Allocate slots for the callee to stuff register args on the stack.
1360           stk_args += 2;
1361 #endif
1362         } else {
1363           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1364           stk_args += 2;
1365         }
1366         break;
1367       case T_LONG:
1368         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1369         // fall through
1370       case T_OBJECT:
1371       case T_ARRAY:
1372       case T_PRIMITIVE_OBJECT:
1373       case T_ADDRESS:
1374       case T_METADATA:
1375         if (int_args < Argument::n_int_register_parameters_c) {
1376           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1377 #ifdef _WIN64
1378           fp_args++;
1379           stk_args += 2;
1380 #endif
1381         } else {
1382           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1383           stk_args += 2;
1384         }
1385         break;
1386       case T_FLOAT:
1387         if (fp_args < Argument::n_float_register_parameters_c) {
1388           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1389 #ifdef _WIN64
1390           int_args++;
1391           // Allocate slots for the callee to stuff register args on the stack.
1392           stk_args += 2;
1393 #endif
1394         } else {
1395           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1396           stk_args += 2;
1397         }
1398         break;
1399       case T_DOUBLE:
1400         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1401         if (fp_args < Argument::n_float_register_parameters_c) {
1402           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1403 #ifdef _WIN64
1404           int_args++;
1405           // Allocate slots for the callee to stuff register args on the stack.
1406           stk_args += 2;
1407 #endif
1408         } else {
1409           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1410           stk_args += 2;
1411         }
1412         break;
1413       case T_VOID: // Halves of longs and doubles
1414         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1415         regs[i].set_bad();
1416         break;
1417       default:
1418         ShouldNotReachHere();
1419         break;
1420       }
1421     }
1422 #ifdef _WIN64
1423   // The Windows ABI requires that we always allocate enough stack space
1424   // for 4 64-bit registers to be stored down.
1425   if (stk_args < 8) {
1426     stk_args = 8;
1427   }
1428 #endif // _WIN64
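       // Worked example (sketch): for sig_bt = { T_INT, T_LONG, T_VOID, T_DOUBLE, T_VOID, T_FLOAT }
       // the System V path above assigns c_rarg0, c_rarg1, c_farg0, c_farg1 and returns 0
       // stack slots, while the Win64 path assigns c_rarg0, c_rarg1, c_farg2, c_farg3 (the
       // int and fp counters advance together) and returns the 8-slot shadow-space minimum.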
1429 
1430   return stk_args;
1431 }
1432 
1433 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1434                                              uint num_bits,
1435                                              uint total_args_passed) {
1436   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1437          "only certain vector sizes are supported for now");
1438 
1439   static const XMMRegister VEC_ArgReg[32] = {
1440      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1441      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1442     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1443     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1444   };
1445 
1446   uint stk_args = 0;
1447   uint fp_args = 0;
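       // A VMRegImpl stack slot is 32 bits, so a 64/128/256/512-bit vector value spans
       // 2/4/8/16 consecutive slots; next_val below is simply that span minus one.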
1448 
1449   for (uint i = 0; i < total_args_passed; i++) {
1450     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1451     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1452     regs[i].set_pair(vmreg->next(next_val), vmreg);
1453   }
1454 
1455   return stk_args;
1456 }
1457 
1458 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1459   // We always ignore the frame_slots arg and just use the space just below the
1460   // frame pointer, which by this time is free to use.
1461   switch (ret_type) {
1462   case T_FLOAT:
1463     __ movflt(Address(rbp, -wordSize), xmm0);
1464     break;
1465   case T_DOUBLE:
1466     __ movdbl(Address(rbp, -wordSize), xmm0);
1467     break;
1468   case T_VOID:  break;
1469   default: {
1470     __ movptr(Address(rbp, -wordSize), rax);
1471     }
1472   }
1473 }
1474 
1475 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1476   // We always ignore the frame_slots arg and just use the space just below the
1477   // frame pointer, which by this time is free to use.
1478   switch (ret_type) {
1479   case T_FLOAT:
1480     __ movflt(xmm0, Address(rbp, -wordSize));
1481     break;
1482   case T_DOUBLE:
1483     __ movdbl(xmm0, Address(rbp, -wordSize));
1484     break;
1485   case T_VOID:  break;
1486   default: {
1487     __ movptr(rax, Address(rbp, -wordSize));
1488     }
1489   }
1490 }
1491 
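     // save_args/restore_args spill and reload the already-shuffled outgoing C arguments
     // (integer registers are pushed, XMM argument registers get a 16-byte stack slot)
     // around the VM leaf calls below (DTrace probes, RedefineClasses tracing, slow-path
     // locking), so the argument registers survive those calls.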
1492 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1493     for ( int i = first_arg ; i < arg_count ; i++ ) {
1494       if (args[i].first()->is_Register()) {
1495         __ push(args[i].first()->as_Register());
1496       } else if (args[i].first()->is_XMMRegister()) {
1497         __ subptr(rsp, 2*wordSize);
1498         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1499       }
1500     }
1501 }
1502 
1503 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1504     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1505       if (args[i].first()->is_Register()) {
1506         __ pop(args[i].first()->as_Register());
1507       } else if (args[i].first()->is_XMMRegister()) {
1508         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1509         __ addptr(rsp, 2*wordSize);
1510       }
1511     }
1512 }
1513 
1514 // Different signatures may require very different orders for the move
1515 // to avoid clobbering other arguments.  There's no simple way to
1516 // order them safely.  Compute a safe order for issuing stores and
1517 // break any cycles in those stores.  This code is fairly general but
1518 // it's not necessary on the other platforms so we keep it in the
1519 // platform dependent code instead of moving it into a shared file.
1520 // (See bugs 7013347 & 7145024.)
1521 // Note that this code is specific to LP64.
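     // Illustrative example (hypothetical registers): if one argument must move
     // rdi -> rsi while another must move rsi -> rdi, neither store can safely be
     // emitted first.  break_cycle() redirects one of them through the caller-supplied
     // temp register, so the emitted order becomes rsi -> tmp, rdi -> rsi, tmp -> rdi.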
1522 class ComputeMoveOrder: public StackObj {
1523   class MoveOperation: public ResourceObj {
1524     friend class ComputeMoveOrder;
1525    private:
1526     VMRegPair        _src;
1527     VMRegPair        _dst;
1528     int              _src_index;
1529     int              _dst_index;
1530     bool             _processed;
1531     MoveOperation*  _next;
1532     MoveOperation*  _prev;
1533 
1534     static int get_id(VMRegPair r) {
1535       return r.first()->value();
1536     }
1537 
1538    public:
1539     MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1540       _src(src)
1541     , _dst(dst)
1542     , _src_index(src_index)
1543     , _dst_index(dst_index)
1544     , _processed(false)
1545     , _next(NULL)
1546     , _prev(NULL) {
1547     }
1548 
1549     VMRegPair src() const              { return _src; }
1550     int src_id() const                 { return get_id(src()); }
1551     int src_index() const              { return _src_index; }
1552     VMRegPair dst() const              { return _dst; }
1553     void set_dst(int i, VMRegPair dst) { _dst_index = i; _dst = dst; }
1554     int dst_index() const              { return _dst_index; }
1555     int dst_id() const                 { return get_id(dst()); }
1556     MoveOperation* next() const       { return _next; }
1557     MoveOperation* prev() const       { return _prev; }
1558     void set_processed()               { _processed = true; }
1559     bool is_processed() const          { return _processed; }
1560 
1561     // insert
1562     void break_cycle(VMRegPair temp_register) {
1563       // create a new store following the last store
1564       // to move from the temp_register to the original
1565       MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1566 
1567       // break the cycle of links and insert new_store at the end
1568       // break the reverse link.
1569       MoveOperation* p = prev();
1570       assert(p->next() == this, "must be");
1571       _prev = NULL;
1572       p->_next = new_store;
1573       new_store->_prev = p;
1574 
1575       // change the original store to save its value in the temp.
1576       set_dst(-1, temp_register);
1577     }
1578 
1579     void link(GrowableArray<MoveOperation*>& killer) {
1580       // link this store in front of the store that it depends on
1581       MoveOperation* n = killer.at_grow(src_id(), NULL);
1582       if (n != NULL) {
1583         assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1584         _next = n;
1585         n->_prev = this;
1586       }
1587     }
1588   };
1589 
1590  private:
1591   GrowableArray<MoveOperation*> edges;
1592 
1593  public:
1594   ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1595                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1596     // Move operations where the dest is the stack can all be
1597     // scheduled first since they can't interfere with the other moves.
1598     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1599       if (in_sig_bt[i] == T_ARRAY) {
1600         c_arg--;
1601         if (out_regs[c_arg].first()->is_stack() &&
1602             out_regs[c_arg + 1].first()->is_stack()) {
1603           arg_order.push(i);
1604           arg_order.push(c_arg);
1605         } else {
1606           if (out_regs[c_arg].first()->is_stack() ||
1607               in_regs[i].first() == out_regs[c_arg].first()) {
1608             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1609           } else {
1610             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1611           }
1612         }
1613       } else if (in_sig_bt[i] == T_VOID) {
1614         arg_order.push(i);
1615         arg_order.push(c_arg);
1616       } else {
1617         if (out_regs[c_arg].first()->is_stack() ||
1618             in_regs[i].first() == out_regs[c_arg].first()) {
1619           arg_order.push(i);
1620           arg_order.push(c_arg);
1621         } else {
1622           add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1623         }
1624       }
1625     }
1626     // Break any cycles in the register moves and emit the stores in the
1627     // proper order.
1628     GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1629     for (int i = 0; i < stores->length(); i++) {
1630       arg_order.push(stores->at(i)->src_index());
1631       arg_order.push(stores->at(i)->dst_index());
1632     }
1633  }
1634 
1635   // Collect all the move operations
1636   void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1637     if (src.first() == dst.first()) return;
1638     edges.append(new MoveOperation(src_index, src, dst_index, dst));
1639   }
1640 
1641   // Walk the edges breaking cycles between moves.  The result list
1642   // can be walked in order to produce the proper set of loads
1643   GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1644     // Record which moves kill which values
1645     GrowableArray<MoveOperation*> killer;
1646     for (int i = 0; i < edges.length(); i++) {
1647       MoveOperation* s = edges.at(i);
1648       assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1649       killer.at_put_grow(s->dst_id(), s, NULL);
1650     }
1651     assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1652            "make sure temp isn't in the registers that are killed");
1653 
1654     // create links between loads and stores
1655     for (int i = 0; i < edges.length(); i++) {
1656       edges.at(i)->link(killer);
1657     }
1658 
1659     // at this point, all the move operations are chained together
1660     // in a doubly linked list.  Processing it backwards finds
1661     // the beginning of the chain, forwards finds the end.  If there's
1662     // a cycle it can be broken at any point,  so pick an edge and walk
1663     // backward until the list ends or we end where we started.
1664     GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1665     for (int e = 0; e < edges.length(); e++) {
1666       MoveOperation* s = edges.at(e);
1667       if (!s->is_processed()) {
1668         MoveOperation* start = s;
1669         // search for the beginning of the chain or cycle
1670         while (start->prev() != NULL && start->prev() != s) {
1671           start = start->prev();
1672         }
1673         if (start->prev() == s) {
1674           start->break_cycle(temp_register);
1675         }
1676         // walk the chain forward inserting to store list
1677         while (start != NULL) {
1678           stores->append(start);
1679           start->set_processed();
1680           start = start->next();
1681         }
1682       }
1683     }
1684     return stores;
1685   }
1686 };
1687 
1688 static void verify_oop_args(MacroAssembler* masm,
1689                             const methodHandle& method,
1690                             const BasicType* sig_bt,
1691                             const VMRegPair* regs) {
1692   Register temp_reg = rbx;  // not part of any compiled calling seq
1693   if (VerifyOops) {
1694     for (int i = 0; i < method->size_of_parameters(); i++) {
1695       if (is_reference_type(sig_bt[i])) {
1696         VMReg r = regs[i].first();
1697         assert(r->is_valid(), "bad oop arg");
1698         if (r->is_stack()) {
1699           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1700           __ verify_oop(temp_reg);
1701         } else {
1702           __ verify_oop(r->as_Register());
1703         }
1704       }
1705     }
1706   }
1707 }
1708 
1709 static void gen_special_dispatch(MacroAssembler* masm,
1710                                  const methodHandle& method,
1711                                  const BasicType* sig_bt,
1712                                  const VMRegPair* regs) {
1713   verify_oop_args(masm, method, sig_bt, regs);
1714   vmIntrinsics::ID iid = method->intrinsic_id();
1715 
1716   // Now write the args into the outgoing interpreter space
1717   bool     has_receiver   = false;
1718   Register receiver_reg   = noreg;
1719   int      member_arg_pos = -1;
1720   Register member_reg     = noreg;
1721   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1722   if (ref_kind != 0) {
1723     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1724     member_reg = rbx;  // known to be free at this point
1725     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1726   } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1727     has_receiver = true;
1728   } else {
1729     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1730   }
1731 
1732   if (member_reg != noreg) {
1733     // Load the member_arg into register, if necessary.
1734     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1735     VMReg r = regs[member_arg_pos].first();
1736     if (r->is_stack()) {
1737       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1738     } else {
1739       // no data motion is needed
1740       member_reg = r->as_Register();
1741     }
1742   }
1743 
1744   if (has_receiver) {
1745     // Make sure the receiver is loaded into a register.
1746     assert(method->size_of_parameters() > 0, "oob");
1747     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1748     VMReg r = regs[0].first();
1749     assert(r->is_valid(), "bad receiver arg");
1750     if (r->is_stack()) {
1751       // Porting note:  This assumes that compiled calling conventions always
1752       // pass the receiver oop in a register.  If this is not true on some
1753       // platform, pick a temp and load the receiver from stack.
1754       fatal("receiver always in a register");
1755       receiver_reg = j_rarg0;  // known to be free at this point
1756       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1757     } else {
1758       // no data motion is needed
1759       receiver_reg = r->as_Register();
1760     }
1761   }
1762 
1763   // Figure out which address we are really jumping to:
1764   MethodHandles::generate_method_handle_dispatch(masm, iid,
1765                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1766 }
1767 
1768 // ---------------------------------------------------------------------------
1769 // Generate a native wrapper for a given method.  The method takes arguments
1770 // in the Java compiled code convention, marshals them to the native
1771 // convention (handlizes oops, etc), transitions to native, makes the call,
1772 // returns to java state (possibly blocking), unhandlizes any result and
1773 // returns.
1774 //
1775 // Critical native functions are a shorthand for the use of
1776 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1777 // functions.  The wrapper is expected to unpack the arguments before
1778 // passing them to the callee. Critical native functions leave the state _in_Java,
1779 // since they cannot stop for GC.
1780 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1781 // block and the check for pending exceptions, since it is impossible for them
1782 // to be thrown.
1783 //
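     // In outline, the wrapper generated below performs: an inline-cache check, an
     // optional class-initialization barrier, the stack bang and frame setup, the
     // nmethod entry barrier, the "Grand Shuffle" of Java arguments into their C
     // locations (handlizing oops), optional locking, the transition to
     // _thread_in_native, the call itself, result unpacking, the transition back via
     // _thread_in_native_trans with a safepoint poll, optional unlocking, JNI handle
     // cleanup, and the return (or a jump to the exception forwarding stub).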
1784 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1785                                                 const methodHandle& method,
1786                                                 int compile_id,
1787                                                 BasicType* in_sig_bt,
1788                                                 VMRegPair* in_regs,
1789                                                 BasicType ret_type) {
1790   if (method->is_method_handle_intrinsic()) {
1791     vmIntrinsics::ID iid = method->intrinsic_id();
1792     intptr_t start = (intptr_t)__ pc();
1793     int vep_offset = ((intptr_t)__ pc()) - start;
1794     gen_special_dispatch(masm,
1795                          method,
1796                          in_sig_bt,
1797                          in_regs);
1798     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1799     __ flush();
1800     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1801     return nmethod::new_native_nmethod(method,
1802                                        compile_id,
1803                                        masm->code(),
1804                                        vep_offset,
1805                                        frame_complete,
1806                                        stack_slots / VMRegImpl::slots_per_word,
1807                                        in_ByteSize(-1),
1808                                        in_ByteSize(-1),
1809                                        (OopMapSet*)NULL);
1810   }
1811   address native_func = method->native_function();
1812   assert(native_func != NULL, "must have function");
1813 
1814   // An OopMap for lock (and class if static)
1815   OopMapSet *oop_maps = new OopMapSet();
1816   intptr_t start = (intptr_t)__ pc();
1817 
1818   // We have received a description of where all the java args are located
1819   // on entry to the wrapper. We need to convert these args to where
1820   // the jni function will expect them. To figure out where they go
1821   // we convert the java signature to a C signature by inserting
1822   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1823 
1824   const int total_in_args = method->size_of_parameters();
1825   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1826 
1827   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1828   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1829   BasicType* in_elem_bt = NULL;
1830 
1831   int argc = 0;
1832   out_sig_bt[argc++] = T_ADDRESS;
1833   if (method->is_static()) {
1834     out_sig_bt[argc++] = T_OBJECT;
1835   }
1836 
1837   for (int i = 0; i < total_in_args ; i++ ) {
1838     out_sig_bt[argc++] = in_sig_bt[i];
1839   }
1840 
1841   // Now figure out where the args must be stored and how much stack space
1842   // they require.
1843   int out_arg_slots;
1844   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1845 
1846   // Compute framesize for the wrapper.  We need to handlize all oops in
1847   // incoming registers
1848 
1849   // Calculate the total number of stack slots we will need.
1850 
1851   // First count the abi requirement plus all of the outgoing args
1852   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1853 
1854   // Now the space for the inbound oop handle area
1855   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1856 
1857   int oop_handle_offset = stack_slots;
1858   stack_slots += total_save_slots;
1859 
1860   // Now any space we need for handlizing a klass if static method
1861 
1862   int klass_slot_offset = 0;
1863   int klass_offset = -1;
1864   int lock_slot_offset = 0;
1865   bool is_static = false;
1866 
1867   if (method->is_static()) {
1868     klass_slot_offset = stack_slots;
1869     stack_slots += VMRegImpl::slots_per_word;
1870     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1871     is_static = true;
1872   }
1873 
1874   // Plus a lock if needed
1875 
1876   if (method->is_synchronized()) {
1877     lock_slot_offset = stack_slots;
1878     stack_slots += VMRegImpl::slots_per_word;
1879   }
1880 
1881   // Now a place (+2) to save return values or temp during shuffling
1882   // + 4 for return address (which we own) and saved rbp
1883   stack_slots += 6;
1884 
1885   // Ok The space we have allocated will look like:
1886   //
1887   //
1888   // FP-> |                     |
1889   //      |---------------------|
1890   //      | 2 slots for moves   |
1891   //      |---------------------|
1892   //      | lock box (if sync)  |
1893   //      |---------------------| <- lock_slot_offset
1894   //      | klass (if static)   |
1895   //      |---------------------| <- klass_slot_offset
1896   //      | oopHandle area      |
1897   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1898   //      | outbound memory     |
1899   //      | based arguments     |
1900   //      |                     |
1901   //      |---------------------|
1902   //      |                     |
1903   // SP-> | out_preserved_slots |
1904   //
1905   //
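       // Rough example (Linux x86_64, 16-byte stack alignment): a static synchronized
       // native method with no Java arguments needs 0 out_preserve + 0 outgoing arg
       // slots + 12 oop-handle slots + 2 klass + 2 lock + 6 = 22 slots, aligned up to
       // 24 slots, i.e. stack_size = 96 bytes.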
1906 
1907 
1908   // Now compute actual number of stack words we need rounding to make
1909   // stack properly aligned.
1910   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1911 
1912   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1913 
1914   // First thing make an ic check to see if we should even be here
1915 
1916   // We are free to use all registers as temps without saving them and
1917   // restoring them except rbp. rbp is the only callee save register
1918   // as far as the interpreter and the compiler(s) are concerned.
1919 
1920 
1921   const Register ic_reg = rax;
1922   const Register receiver = j_rarg0;
1923 
1924   Label hit;
1925   Label exception_pending;
1926 
1927   assert_different_registers(ic_reg, receiver, rscratch1);
1928   __ verify_oop(receiver);
1929   __ load_klass(rscratch1, receiver, rscratch2);
1930   __ cmpq(ic_reg, rscratch1);
1931   __ jcc(Assembler::equal, hit);
1932 
1933   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1934 
1935   // Verified entry point must be aligned
1936   __ align(8);
1937 
1938   __ bind(hit);
1939 
1940   int vep_offset = ((intptr_t)__ pc()) - start;
1941 
1942   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1943     Label L_skip_barrier;
1944     Register klass = r10;
1945     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1946     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1947 
1948     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1949 
1950     __ bind(L_skip_barrier);
1951   }
1952 
1953 #ifdef COMPILER1
1954   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1955   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1956     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1957   }
1958 #endif // COMPILER1
1959 
1960   // The instruction at the verified entry point must be 5 bytes or longer
1961   // because it can be patched on the fly by make_non_entrant. The stack bang
1962   // instruction fits that requirement.
1963 
1964   // Generate stack overflow check
1965   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1966 
1967   // Generate a new frame for the wrapper.
1968   __ enter();
1969   // -2 because return address is already present and so is saved rbp
1970   __ subptr(rsp, stack_size - 2*wordSize);
1971 
1972   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1973   bs->nmethod_entry_barrier(masm);
1974 
1975   // Frame is now completed as far as size and linkage.
1976   int frame_complete = ((intptr_t)__ pc()) - start;
1977 
1978     if (UseRTMLocking) {
1979       // Abort RTM transaction before calling JNI
1980       // because critical section will be large and will be
1981       // aborted anyway. Also nmethod could be deoptimized.
1982       __ xabort(0);
1983     }
1984 
1985 #ifdef ASSERT
1986     {
1987       Label L;
1988       __ mov(rax, rsp);
1989       __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1990       __ cmpptr(rax, rsp);
1991       __ jcc(Assembler::equal, L);
1992       __ stop("improperly aligned stack");
1993       __ bind(L);
1994     }
1995 #endif /* ASSERT */
1996 
1997 
1998   // We use r14 as the oop handle for the receiver/klass
1999   // It is callee save so it survives the call to native
2000 
2001   const Register oop_handle_reg = r14;
2002 
2003   //
2004   // We immediately shuffle the arguments so that for any vm call we have to
2005   // make from here on out (sync slow path, jvmti, etc.) we will have
2006   // captured the oops from our caller and have a valid oopMap for
2007   // them.
2008 
2009   // -----------------
2010   // The Grand Shuffle
2011 
2012   // The Java calling convention is either equal (linux) or denser (win64) than the
2013   // c calling convention. However, because of the jni_env argument, the c calling
2014   // convention always has at least one more (and two for static) arguments than Java.
2015   // Therefore if we move the args from java -> c backwards then we will never have
2016   // a register->register conflict and we don't have to build a dependency graph
2017   // and figure out how to break any cycles.
2018   //
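       // Concretely, the loop below visits (Java arg i, C arg c_arg) pairs from the last
       // argument down to the first; given the out_sig_bt layout built above, Java arg i
       // always lands in C arg i + 1 (i + 2 for static methods, which also take the class
       // mirror), and the switch that follows emits each individual move.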
2019 
2020   // Record esp-based slot for receiver on stack for non-static methods
2021   int receiver_offset = -1;
2022 
2023   // This is a trick. We double the stack slots so we can claim
2024   // the oops in the caller's frame. Since we are sure to have
2025   // more args than the caller, doubling is enough to make
2026   // sure we can capture all the incoming oop args from the
2027   // caller.
2028   //
2029   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2030 
2031   // Mark location of rbp (someday)
2032   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2033 
2034   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2035   // All inbound args are referenced based on rbp and all outbound args via rsp.
2036 
2037 
2038 #ifdef ASSERT
2039   bool reg_destroyed[RegisterImpl::number_of_registers];
2040   bool freg_destroyed[XMMRegisterImpl::number_of_registers];
2041   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
2042     reg_destroyed[r] = false;
2043   }
2044   for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
2045     freg_destroyed[f] = false;
2046   }
2047 
2048 #endif /* ASSERT */
2049 
2050   // For JNI natives the incoming and outgoing registers are offset upwards.
2051   GrowableArray<int> arg_order(2 * total_in_args);
2052 
2053   VMRegPair tmp_vmreg;
2054   tmp_vmreg.set2(rbx->as_VMReg());
2055 
2056   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2057     arg_order.push(i);
2058     arg_order.push(c_arg);
2059   }
2060 
2061   int temploc = -1;
2062   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2063     int i = arg_order.at(ai);
2064     int c_arg = arg_order.at(ai + 1);
2065     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2066 #ifdef ASSERT
2067     if (in_regs[i].first()->is_Register()) {
2068       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2069     } else if (in_regs[i].first()->is_XMMRegister()) {
2070       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2071     }
2072     if (out_regs[c_arg].first()->is_Register()) {
2073       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2074     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2075       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2076     }
2077 #endif /* ASSERT */
2078     switch (in_sig_bt[i]) {
2079       case T_ARRAY:
2080       case T_PRIMITIVE_OBJECT:
2081       case T_OBJECT:
2082         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2083                     ((i == 0) && (!is_static)),
2084                     &receiver_offset);
2085         break;
2086       case T_VOID:
2087         break;
2088 
2089       case T_FLOAT:
2090         __ float_move(in_regs[i], out_regs[c_arg]);
2091           break;
2092 
2093       case T_DOUBLE:
2094         assert( i + 1 < total_in_args &&
2095                 in_sig_bt[i + 1] == T_VOID &&
2096                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2097         __ double_move(in_regs[i], out_regs[c_arg]);
2098         break;
2099 
2100       case T_LONG :
2101         __ long_move(in_regs[i], out_regs[c_arg]);
2102         break;
2103 
2104       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2105 
2106       default:
2107         __ move32_64(in_regs[i], out_regs[c_arg]);
2108     }
2109   }
2110 
2111   int c_arg;
2112 
2113   // Pre-load a static method's oop into r14.  Used both by locking code and
2114   // the normal JNI call code.
2115   // point c_arg at the first arg that is already loaded in case we
2116   // need to spill before we call out
2117   c_arg = total_c_args - total_in_args;
2118 
2119   if (method->is_static()) {
2120 
2121     //  load oop into a register
2122     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2123 
2124     // Now handlize the static class mirror; it's known to be non-null.
2125     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2126     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2127 
2128     // Now get the handle
2129     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2130     // store the klass handle as second argument
2131     __ movptr(c_rarg1, oop_handle_reg);
2132     // and protect the arg if we must spill
2133     c_arg--;
2134   }
2135 
2136   // Change state to native (we save the return address in the thread, since it might not
2137   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2138   // points into the right code segment. It does not have to be the correct return pc.
2139   // We use the same pc/oopMap repeatedly when we call out
2140 
2141   intptr_t the_pc = (intptr_t) __ pc();
2142   oop_maps->add_gc_map(the_pc - start, map);
2143 
2144   __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2145 
2146 
2147   // We have all of the arguments set up at this point. We must not touch any
2148   // argument registers from here on (if we had to save/restore them, there is no oop map covering them).
2149 
2150   {
2151     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2152     // protect the args we've loaded
2153     save_args(masm, total_c_args, c_arg, out_regs);
2154     __ mov_metadata(c_rarg1, method());
2155     __ call_VM_leaf(
2156       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2157       r15_thread, c_rarg1);
2158     restore_args(masm, total_c_args, c_arg, out_regs);
2159   }
2160 
2161   // RedefineClasses() tracing support for obsolete method entry
2162   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2163     // protect the args we've loaded
2164     save_args(masm, total_c_args, c_arg, out_regs);
2165     __ mov_metadata(c_rarg1, method());
2166     __ call_VM_leaf(
2167       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2168       r15_thread, c_rarg1);
2169     restore_args(masm, total_c_args, c_arg, out_regs);
2170   }
2171 
2172   // Lock a synchronized method
2173 
2174   // Register definitions used by locking and unlocking
2175 
2176   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2177   const Register obj_reg  = rbx;  // Will contain the oop
2178   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2179   const Register old_hdr  = r13;  // value of old header at unlock time
2180 
2181   Label slow_path_lock;
2182   Label lock_done;
2183 
2184   if (method->is_synchronized()) {
2185 
2186     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2187 
2188     // Get the handle (the 2nd argument)
2189     __ mov(oop_handle_reg, c_rarg1);
2190 
2191     // Get address of the box
2192 
2193     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2194 
2195     // Load the oop from the handle
2196     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2197 
2198     if (!UseHeavyMonitors) {
2199       // Load immediate 1 into swap_reg %rax
2200       __ movl(swap_reg, 1);
2201 
2202       // Load (object->mark() | 1) into swap_reg %rax
2203       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2204       if (EnableValhalla) {
2205         // Mask inline_type bit such that we go to the slow path if object is an inline type
2206         __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place));
2207       }
2208 
2209       // Save (object->mark() | 1) into BasicLock's displaced header
2210       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2211 
2212       // src -> dest iff dest == rax else rax <- dest
2213       __ lock();
2214       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2215       __ jcc(Assembler::equal, lock_done);
2216 
2217       // Hmm should this move to the slow path code area???
2218 
2219       // Test if the oopMark is an obvious stack pointer, i.e.,
2220       //  1) (mark & 3) == 0, and
2221       //  2) rsp <= mark < rsp + os::pagesize()
2222       // These 3 tests can be done by evaluating the following
2223       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2224       // assuming both stack pointer and pagesize have their
2225       // least significant 2 bits clear.
2226       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
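           // For example, with a 4K page the mask 3 - os::vm_page_size() keeps only the
           // low two bits and the bits at or above the page size, so the result is zero
           // exactly when (mark - rsp) has its low two bits clear and is less than one page.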
2227 
2228       __ subptr(swap_reg, rsp);
2229       __ andptr(swap_reg, 3 - os::vm_page_size());
2230 
2231       // Save the test result, for recursive case, the result is zero
2232       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2233       __ jcc(Assembler::notEqual, slow_path_lock);
2234     } else {
2235       __ jmp(slow_path_lock);
2236     }
2237 
2238     // Slow path will re-enter here
2239 
2240     __ bind(lock_done);
2241   }
2242 
2243   // Finally just about ready to make the JNI call
2244 
2245   // get JNIEnv* which is first argument to native
2246   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2247 
2248   // Now set thread in native
2249   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2250 
2251   __ call(RuntimeAddress(native_func));
2252 
2253   // Verify or restore cpu control state after JNI call
2254   __ restore_cpu_control_state_after_jni();
2255 
2256   // Unpack native results.
2257   switch (ret_type) {
2258   case T_BOOLEAN: __ c2bool(rax);            break;
2259   case T_CHAR   : __ movzwl(rax, rax);      break;
2260   case T_BYTE   : __ sign_extend_byte (rax); break;
2261   case T_SHORT  : __ sign_extend_short(rax); break;
2262   case T_INT    : /* nothing to do */        break;
2263   case T_DOUBLE :
2264   case T_FLOAT  :
2265     // Result is in xmm0 we'll save as needed
2266     break;
2267   case T_ARRAY:                 // Really a handle
2268   case T_PRIMITIVE_OBJECT:           // Really a handle
2269   case T_OBJECT:                // Really a handle
2270       break; // can't de-handlize until after safepoint check
2271   case T_VOID: break;
2272   case T_LONG: break;
2273   default       : ShouldNotReachHere();
2274   }
2275 
2276   Label after_transition;
2277 
2278   // Switch thread to "native transition" state before reading the synchronization state.
2279   // This additional state is necessary because reading and testing the synchronization
2280   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2281   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2282   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2283   //     Thread A is resumed to finish this native method, but doesn't block here since it
2284   //     didn't see any synchronization in progress, and escapes.
2285   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2286 
2287   // Force this write out before the read below
2288   __ membar(Assembler::Membar_mask_bits(
2289               Assembler::LoadLoad | Assembler::LoadStore |
2290               Assembler::StoreLoad | Assembler::StoreStore));
2291 
2292   // check for safepoint operation in progress and/or pending suspend requests
2293   {
2294     Label Continue;
2295     Label slow_path;
2296 
2297     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2298 
2299     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2300     __ jcc(Assembler::equal, Continue);
2301     __ bind(slow_path);
2302 
2303     // Don't use call_VM as it will see a possible pending exception and forward it
2304     // and never return here preventing us from clearing _last_native_pc down below.
2305     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2306     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2307     // by hand.
2308     //
2309     __ vzeroupper();
2310     save_native_result(masm, ret_type, stack_slots);
2311     __ mov(c_rarg0, r15_thread);
2312     __ mov(r12, rsp); // remember sp
2313     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2314     __ andptr(rsp, -16); // align stack as required by ABI
2315     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2316     __ mov(rsp, r12); // restore sp
2317     __ reinit_heapbase();
2318     // Restore any method result value
2319     restore_native_result(masm, ret_type, stack_slots);
2320     __ bind(Continue);
2321   }
2322 
2323   // change thread state
2324   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2325   __ bind(after_transition);
2326 
2327   Label reguard;
2328   Label reguard_done;
2329   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2330   __ jcc(Assembler::equal, reguard);
2331   __ bind(reguard_done);
2332 
2333   // native result if any is live
2334 
2335   // Unlock
2336   Label unlock_done;
2337   Label slow_path_unlock;
2338   if (method->is_synchronized()) {
2339 
2340     // Get locked oop from the handle we passed to jni
2341     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2342 
2343     Label done;
2344 
2345     if (!UseHeavyMonitors) {
2346       // Simple recursive lock?
2347       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2348       __ jcc(Assembler::equal, done);
2349     }
2350 
2351     // Must save rax if it is live now because cmpxchg must use it
2352     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2353       save_native_result(masm, ret_type, stack_slots);
2354     }
2355 
2356 
2357     if (!UseHeavyMonitors) {
2358       // get address of the stack lock
2359       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2360       //  get old displaced header
2361       __ movptr(old_hdr, Address(rax, 0));
2362 
2363       // Atomic swap old header if oop still contains the stack lock
2364       __ lock();
2365       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2366       __ jcc(Assembler::notEqual, slow_path_unlock);
2367     } else {
2368       __ jmp(slow_path_unlock);
2369     }
2370 
2371     // slow path re-enters here
2372     __ bind(unlock_done);
2373     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2374       restore_native_result(masm, ret_type, stack_slots);
2375     }
2376 
2377     __ bind(done);
2378 
2379   }
2380   {
2381     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2382     save_native_result(masm, ret_type, stack_slots);
2383     __ mov_metadata(c_rarg1, method());
2384     __ call_VM_leaf(
2385          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2386          r15_thread, c_rarg1);
2387     restore_native_result(masm, ret_type, stack_slots);
2388   }
2389 
2390   __ reset_last_Java_frame(false);
2391 
2392   // Unbox oop result, e.g. JNIHandles::resolve value.
2393   if (is_reference_type(ret_type)) {
2394     __ resolve_jobject(rax /* value */,
2395                        r15_thread /* thread */,
2396                        rcx /* tmp */);
2397   }
2398 
2399   if (CheckJNICalls) {
2400     // clear_pending_jni_exception_check
2401     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2402   }
2403 
2404   // reset handle block
2405   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2406   __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2407 
2408   // pop our frame
2409 
2410   __ leave();
2411 
2412   // Any exception pending?
2413   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2414   __ jcc(Assembler::notEqual, exception_pending);
2415 
2416   // Return
2417 
2418   __ ret(0);
2419 
2420   // Unexpected paths are out of line and go here
2421 
2422   // forward the exception
2423   __ bind(exception_pending);
2424 
2425   // and forward the exception
2426   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2427 
2428   // Slow path locking & unlocking
2429   if (method->is_synchronized()) {
2430 
2431     // BEGIN Slow path lock
2432     __ bind(slow_path_lock);
2433 
2434     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2435     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2436 
2437     // protect the args we've loaded
2438     save_args(masm, total_c_args, c_arg, out_regs);
2439 
2440     __ mov(c_rarg0, obj_reg);
2441     __ mov(c_rarg1, lock_reg);
2442     __ mov(c_rarg2, r15_thread);
2443 
2444     // Not a leaf but we have last_Java_frame setup as we want
2445     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2446     restore_args(masm, total_c_args, c_arg, out_regs);
2447 
2448 #ifdef ASSERT
2449     { Label L;
2450     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2451     __ jcc(Assembler::equal, L);
2452     __ stop("no pending exception allowed on exit from monitorenter");
2453     __ bind(L);
2454     }
2455 #endif
2456     __ jmp(lock_done);
2457 
2458     // END Slow path lock
2459 
2460     // BEGIN Slow path unlock
2461     __ bind(slow_path_unlock);
2462 
2463     // If we haven't already saved the native result we must save it now as xmm registers
2464     // are still exposed.
2465     __ vzeroupper();
2466     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2467       save_native_result(masm, ret_type, stack_slots);
2468     }
2469 
2470     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2471 
2472     __ mov(c_rarg0, obj_reg);
2473     __ mov(c_rarg2, r15_thread);
2474     __ mov(r12, rsp); // remember sp
2475     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2476     __ andptr(rsp, -16); // align stack as required by ABI
2477 
2478     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2479     // NOTE that obj_reg == rbx currently
2480     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2481     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2482 
2483     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2484     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2485     __ mov(rsp, r12); // restore sp
2486     __ reinit_heapbase();
2487 #ifdef ASSERT
2488     {
2489       Label L;
2490       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2491       __ jcc(Assembler::equal, L);
2492       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2493       __ bind(L);
2494     }
2495 #endif /* ASSERT */
2496 
2497     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2498 
2499     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2500       restore_native_result(masm, ret_type, stack_slots);
2501     }
2502     __ jmp(unlock_done);
2503 
2504     // END Slow path unlock
2505 
2506   } // synchronized
2507 
2508   // SLOW PATH Reguard the stack if needed
2509 
2510   __ bind(reguard);
2511   __ vzeroupper();
2512   save_native_result(masm, ret_type, stack_slots);
2513   __ mov(r12, rsp); // remember sp
2514   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2515   __ andptr(rsp, -16); // align stack as required by ABI
2516   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2517   __ mov(rsp, r12); // restore sp
2518   __ reinit_heapbase();
2519   restore_native_result(masm, ret_type, stack_slots);
2520   // and continue
2521   __ jmp(reguard_done);
2522 
2523 
2524 
2525   __ flush();
2526 
2527   nmethod *nm = nmethod::new_native_nmethod(method,
2528                                             compile_id,
2529                                             masm->code(),
2530                                             vep_offset,
2531                                             frame_complete,
2532                                             stack_slots / VMRegImpl::slots_per_word,
2533                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2534                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2535                                             oop_maps);
2536 
2537   return nm;
2538 }
2539 
2540 // This function returns the adjustment (in number of words) to a c2i adapter
2541 // activation, for use during deoptimization.
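     // For example, a callee with 2 parameters and 5 locals yields an adjustment of
     // (5 - 2) * Interpreter::stackElementWords words.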
2542 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2543   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2544 }
2545 
2546 
2547 uint SharedRuntime::out_preserve_stack_slots() {
2548   return 0;
2549 }
2550 
2551 
2552 // Number of stack slots between incoming argument block and the start of
2553 // a new frame.  The PROLOG must add this many slots to the stack.  The
2554 // EPILOG must remove this many slots.  amd64 needs two slots for
2555 // return address.
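     // (Presumably the 4 covers the return address plus the saved rbp, two 32-bit slots
     // each, and VerifyStackAtCalls reserves room for an extra stack canary word.)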
2556 uint SharedRuntime::in_preserve_stack_slots() {
2557   return 4 + 2 * VerifyStackAtCalls;
2558 }
2559 
2560 //------------------------------generate_deopt_blob----------------------------
2561 void SharedRuntime::generate_deopt_blob() {
2562   // Allocate space for the code
2563   ResourceMark rm;
2564   // Setup code generation tools
2565   int pad = 0;
2566   if (UseAVX > 2) {
2567     pad += 1024;
2568   }
2569 #if INCLUDE_JVMCI
2570   if (EnableJVMCI) {
2571     pad += 512; // Increase the buffer size when compiling for JVMCI
2572   }
2573 #endif
2574   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2575   MacroAssembler* masm = new MacroAssembler(&buffer);
2576   int frame_size_in_words;
2577   OopMap* map = NULL;
2578   OopMapSet *oop_maps = new OopMapSet();
2579 
2580   // -------------
2581   // This code enters when returning to a de-optimized nmethod.  A return
2582   // address has been pushed on the stack, and return values are in
2583   // registers.
2584   // If we are doing a normal deopt then we were called from the patched
2585   // nmethod from the point we returned to the nmethod. So the return
2586   // address on the stack is wrong by NativeCall::instruction_size
2587   // We will adjust the value so it looks like we have the original return
2588   // address on the stack (like when we eagerly deoptimized).
2589   // In the case of an exception pending when deoptimizing, we enter
2590   // with a return address on the stack that points after the call we patched
2591   // into the exception handler. We have the following register state from,
2592   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2593   //    rax: exception oop
2594   //    rbx: exception handler
2595   //    rdx: throwing pc
2596   // So in this case we simply jam rdx into the useless return address and
2597   // the stack looks just like we want.
2598   //
2599   // At this point we need to de-opt.  We save the argument return
2600   // registers.  We call the first C routine, fetch_unroll_info().  This
2601   // routine captures the return values and returns a structure which
2602   // describes the current frame size and the sizes of all replacement frames.
2603   // The current frame is compiled code and may contain many inlined
2604   // functions, each with their own JVM state.  We pop the current frame, then
2605   // push all the new frames.  Then we call the C routine unpack_frames() to
2606   // populate these frames.  Finally unpack_frames() returns us the new target
2607   // address.  Notice that callee-save registers are BLOWN here; they have
2608   // already been captured in the vframeArray at the time the return PC was
2609   // patched.
2610   address start = __ pc();
2611   Label cont;
2612 
2613   // Prolog for non exception case!
2614 
2615   // Save everything in sight.
2616   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2617 
2618   // Normal deoptimization.  Save exec mode for unpack_frames.
2619   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2620   __ jmp(cont);
2621 
2622   int reexecute_offset = __ pc() - start;
2623 #if INCLUDE_JVMCI && !defined(COMPILER1)
2624   if (EnableJVMCI && UseJVMCICompiler) {
2625     // JVMCI does not use this kind of deoptimization
2626     __ should_not_reach_here();
2627   }
2628 #endif
2629 
2630   // Reexecute case
2631   // The return address is the pc that describes which bci to re-execute at.
2632 
2633   // No need to update map as each call to save_live_registers will produce identical oopmap
2634   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2635 
2636   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2637   __ jmp(cont);
2638 
2639 #if INCLUDE_JVMCI
2640   Label after_fetch_unroll_info_call;
2641   int implicit_exception_uncommon_trap_offset = 0;
2642   int uncommon_trap_offset = 0;
2643 
2644   if (EnableJVMCI) {
2645     implicit_exception_uncommon_trap_offset = __ pc() - start;
2646 
2647     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2648     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2649 
2650     uncommon_trap_offset = __ pc() - start;
2651 
2652     // Save everything in sight.
2653     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2654     // fetch_unroll_info needs to call last_java_frame()
2655     __ set_last_Java_frame(noreg, noreg, NULL);
2656 
2657     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2658     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2659 
2660     __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2661     __ mov(c_rarg0, r15_thread);
2662     __ movl(c_rarg2, r14); // exec mode
2663     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2664     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2665 
2666     __ reset_last_Java_frame(false);
2667 
2668     __ jmp(after_fetch_unroll_info_call);
2669   } // EnableJVMCI
2670 #endif // INCLUDE_JVMCI
2671 
2672   int exception_offset = __ pc() - start;
2673 
2674   // Prolog for exception case
2675 
2676   // All registers are dead at this entry point, except for rax and rdx,
2677   // which contain the exception oop and the exception pc,
2678   // respectively.  Set them in TLS and fall through to the
2679   // unpack_with_exception_in_tls entry point.
2680 
2681   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2682   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2683 
2684   int exception_in_tls_offset = __ pc() - start;
2685 
2686   // new implementation because exception oop is now passed in JavaThread
2687 
2688   // Prolog for exception case
2689   // All registers must be preserved because they might be used by LinearScan
2690   // Exception oop and throwing PC are passed in JavaThread
2691   // tos: stack at point of call to method that threw the exception (i.e. only
2692   // args are on the stack, no return address)
2693 
2694   // make room on stack for the return address
2695   // It will be patched later with the throwing pc. The correct value is not
2696   // available now because loading it from memory would destroy registers.
2697   __ push(0);
2698 
2699   // Save everything in sight.
2700   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2701 
2702   // Now it is safe to overwrite any register
2703 
2704   // Deopt during an exception.  Save exec mode for unpack_frames.
2705   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2706 
2707   // load throwing pc from JavaThread and patch it as the return address
2708   // of the current frame. Then clear the field in JavaThread
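       // (save_live_registers left rbp pointing at its saved-rbp slot, so
       // Address(rbp, wordSize) is the return-address slot, i.e. the placeholder
       // 0 pushed above.)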
2709 
2710   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2711   __ movptr(Address(rbp, wordSize), rdx);
2712   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2713 
2714 #ifdef ASSERT
2715   // verify that there is really an exception oop in JavaThread
2716   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2717   __ verify_oop(rax);
2718 
2719   // verify that there is no pending exception
2720   Label no_pending_exception;
2721   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2722   __ testptr(rax, rax);
2723   __ jcc(Assembler::zero, no_pending_exception);
2724   __ stop("must not have pending exception here");
2725   __ bind(no_pending_exception);
2726 #endif
2727 
2728   __ bind(cont);
2729 
2730   // Call C code.  Need thread and this frame, but NOT official VM entry
2731   // crud.  We cannot block on this call, no GC can happen.
2732   //
2733   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2734 
2735   // fetch_unroll_info needs to call last_java_frame().
2736 
2737   __ set_last_Java_frame(noreg, noreg, NULL);
2738 #ifdef ASSERT
2739   { Label L;
2740     __ cmpptr(Address(r15_thread,
2741                     JavaThread::last_Java_fp_offset()),
2742             (int32_t)0);
2743     __ jcc(Assembler::equal, L);
2744     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2745     __ bind(L);
2746   }
2747 #endif // ASSERT
2748   __ mov(c_rarg0, r15_thread);
2749   __ movl(c_rarg1, r14); // exec_mode
2750   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2751 
2752   // Need to have an oopmap that tells fetch_unroll_info where to
2753   // find any register it might need.
2754   oop_maps->add_gc_map(__ pc() - start, map);
2755 
2756   __ reset_last_Java_frame(false);
2757 
2758 #if INCLUDE_JVMCI
2759   if (EnableJVMCI) {
2760     __ bind(after_fetch_unroll_info_call);
2761   }
2762 #endif
2763 
2764   // Load UnrollBlock* into rdi
2765   __ mov(rdi, rax);
2766 
2767   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2768   Label noException;
2769   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2770   __ jcc(Assembler::notEqual, noException);
2771   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2772   // QQQ this is useless; exception_pc was cleared to NULL above
2773   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2774   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2775   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2776 
2777   __ verify_oop(rax);
2778 
2779   // Overwrite the result registers with the exception results.
2780   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2781   // I think this is useless
2782   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2783 
2784   __ bind(noException);
2785 
2786   // Only register save data is on the stack.
2787   // Now restore the result registers.  Everything else is either dead
2788   // or captured in the vframeArray.
2789   RegisterSaver::restore_result_registers(masm);
2790 
2791   // All of the register save area has been popped off the stack. Only the
2792   // return address remains.
2793 
2794   // Pop all the frames we must move/replace.
2795   //
2796   // Frame picture (youngest to oldest)
2797   // 1: self-frame (no frame link)
2798   // 2: deopting frame  (no frame link)
2799   // 3: caller of deopting frame (could be compiled/interpreted).
2800   //
2801   // Note: by leaving the return address of self-frame on the stack
2802   // and using the size of frame 2 to adjust the stack
2803   // when we are done the return to frame 3 will still be on the stack.
2804 
2805   // Pop deoptimized frame
2806   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2807   __ addptr(rsp, rcx);
2808 
2809   // rsp should be pointing at the return address to the caller (3)
2810 
2811   // Pick up the initial fp we should save
2812   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2813   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2814 
2815 #ifdef ASSERT
2816   // Compilers generate code that bangs the stack by as much as the
2817   // interpreter would need. So this stack banging should never
2818   // trigger a fault. Verify that it does not on non-product builds.
2819   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2820   __ bang_stack_size(rbx, rcx);
2821 #endif
2822 
2823   // Load address of array of frame pcs into rcx
2824   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2825 
2826   // Trash the old pc
2827   __ addptr(rsp, wordSize);
2828 
2829   // Load address of array of frame sizes into rsi
2830   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2831 
2832   // Load counter into rdx
2833   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2834 
2835   // Now adjust the caller's stack to make up for the extra locals
2836   // but record the original sp so that we can save it in the skeletal interpreter
2837   // frame and the stack walking of interpreter_sender will get the unextended sp
2838   // value and not the "real" sp value.
2839 
2840   const Register sender_sp = r8;
2841 
2842   __ mov(sender_sp, rsp);
2843   __ movl(rbx, Address(rdi,
2844                        Deoptimization::UnrollBlock::
2845                        caller_adjustment_offset_in_bytes()));
2846   __ subptr(rsp, rbx);
2847 
2848   // Push interpreter frames in a loop
2849   Label loop;
2850   __ bind(loop);
2851   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2852   __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
2853   __ pushptr(Address(rcx, 0));          // Save return address
2854   __ enter();                           // Save old & set new rbp
2855   __ subptr(rsp, rbx);                  // Prolog
2856   // This value is corrected by layout_activation_impl
2857   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2858   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2859   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2860   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2861   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2862   __ decrementl(rdx);                   // Decrement counter
2863   __ jcc(Assembler::notZero, loop);
2864   __ pushptr(Address(rcx, 0));          // Save final return address
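       // Each pass of the loop above built one skeletal interpreter frame: the
       // frame's return pc, the saved rbp linked by enter(), and frame_size minus
       // two words of not-yet-initialized space.  layout_activation_impl fills in
       // the interpreter state later; sender_sp and the cleared last_sp keep the
       // partially built frames walkable in the meantime.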
2865 
2866   // Re-push self-frame
2867   __ enter();                           // Save old & set new rbp
2868 
2869   // Allocate a full sized register save area.
2870   // Return address and rbp are in place, so we allocate two less words.
2871   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2872 
2873   // Restore frame locals after moving the frame
2874   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2875   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2876 
2877   // Call C code.  Need thread but NOT official VM entry
2878   // crud.  We cannot block on this call, no GC can happen.  Call should
2879   // restore return values to their stack-slots with the new SP.
2880   //
2881   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2882 
2883   // Use rbp because the frames look interpreted now
2884   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2885   // Don't need the precise return PC here, just precise enough to point into this code blob.
2886   address the_pc = __ pc();
2887   __ set_last_Java_frame(noreg, rbp, the_pc);
2888 
2889   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2890   __ mov(c_rarg0, r15_thread);
2891   __ movl(c_rarg1, r14); // second arg: exec_mode
2892   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2893   // Revert SP alignment after call since we're going to do some SP relative addressing below
2894   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2895 
2896   // Set an oopmap for the call site
2897   // Use the same PC we used for the last java frame
2898   oop_maps->add_gc_map(the_pc - start,
2899                        new OopMap( frame_size_in_words, 0 ));
2900 
2901   // Clear fp AND pc
2902   __ reset_last_Java_frame(true);
2903 
2904   // Collect return values
2905   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2906   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2907   // I think this is useless (throwing pc?)
2908   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2909 
2910   // Pop self-frame.
2911   __ leave();                           // Epilog
2912 
2913   // Jump to interpreter
2914   __ ret(0);
2915 
2916   // Make sure all code is generated
2917   masm->flush();
2918 
2919   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2920   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2921 #if INCLUDE_JVMCI
2922   if (EnableJVMCI) {
2923     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2924     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2925   }
2926 #endif
2927 }
2928 
2929 #ifdef COMPILER2
2930 //------------------------------generate_uncommon_trap_blob--------------------
2931 void SharedRuntime::generate_uncommon_trap_blob() {
2932   // Allocate space for the code
2933   ResourceMark rm;
2934   // Setup code generation tools
2935   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2936   MacroAssembler* masm = new MacroAssembler(&buffer);
2937 
2938   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2939 
2940   address start = __ pc();
2941 
2942   if (UseRTMLocking) {
2943     // Abort RTM transaction before possible nmethod deoptimization.
2944     __ xabort(0);
2945   }
2946 
2947   // Push self-frame.  We get here with a return address on the
2948   // stack, so rsp is 8-byte aligned until we allocate our frame.
2949   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2950 
2951   // No callee-saved registers; rbp is assumed implicitly saved
2952   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2953 
2954   // The compiler left unloaded_class_index in j_rarg0; move it to where
2955   // the runtime expects it.
2956   __ movl(c_rarg1, j_rarg0);
2957 
2958   __ set_last_Java_frame(noreg, noreg, NULL);
2959 
2960   // Call C code.  Need thread but NOT official VM entry
2961   // crud.  We cannot block on this call, no GC can happen.  Call should
2962   // capture callee-saved registers as well as return values.
2963   // Thread is in rdi already.
2964   //
2965   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2966 
2967   __ mov(c_rarg0, r15_thread);
2968   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2969   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2970 
2971   // Set an oopmap for the call site
2972   OopMapSet* oop_maps = new OopMapSet();
2973   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2974 
2975   // location of rbp is known implicitly by the frame sender code
2976 
2977   oop_maps->add_gc_map(__ pc() - start, map);
2978 
2979   __ reset_last_Java_frame(false);
2980 
2981   // Load UnrollBlock* into rdi
2982   __ mov(rdi, rax);
2983 
2984 #ifdef ASSERT
2985   { Label L;
2986     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2987             (int32_t)Deoptimization::Unpack_uncommon_trap);
2988     __ jcc(Assembler::equal, L);
2989     __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap");
2990     __ bind(L);
2991   }
2992 #endif
2993 
2994   // Pop all the frames we must move/replace.
2995   //
2996   // Frame picture (youngest to oldest)
2997   // 1: self-frame (no frame link)
2998   // 2: deopting frame  (no frame link)
2999   // 3: caller of deopting frame (could be compiled/interpreted).
3000 
3001   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3002   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3003 
3004   // Pop deoptimized frame (int)
3005   __ movl(rcx, Address(rdi,
3006                        Deoptimization::UnrollBlock::
3007                        size_of_deoptimized_frame_offset_in_bytes()));
3008   __ addptr(rsp, rcx);
3009 
3010   // rsp should be pointing at the return address to the caller (3)
3011 
3012   // Pick up the initial fp we should save
3013   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3014   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
3015 
3016 #ifdef ASSERT
3017   // Compilers generate code that bangs the stack by as much as the
3018   // interpreter would need. So this stack banging should never
3019   // trigger a fault. Verify that it does not on non-product builds.
3020   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
3021   __ bang_stack_size(rbx, rcx);
3022 #endif
3023 
3024   // Load address of array of frame pcs into rcx (address*)
3025   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
3026 
3027   // Trash the return pc
3028   __ addptr(rsp, wordSize);
3029 
3030   // Load address of array of frame sizes into rsi (intptr_t*)
3031   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
3032 
3033   // Counter
3034   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
3035 
3036   // Now adjust the caller's stack to make up for the extra locals but
3037   // record the original sp so that we can save it in the skeletal
3038   // interpreter frame and the stack walking of interpreter_sender
3039   // will get the unextended sp value and not the "real" sp value.
3040 
3041   const Register sender_sp = r8;
3042 
3043   __ mov(sender_sp, rsp);
3044   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
3045   __ subptr(rsp, rbx);
3046 
3047   // Push interpreter frames in a loop
3048   Label loop;
3049   __ bind(loop);
3050   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3051   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3052   __ pushptr(Address(rcx, 0));     // Save return address
3053   __ enter();                      // Save old & set new rbp
3054   __ subptr(rsp, rbx);             // Prolog
3055   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3056             sender_sp);            // Make it walkable
3057   // This value is corrected by layout_activation_impl
3058   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
3059   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3060   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3061   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3062   __ decrementl(rdx);              // Decrement counter
3063   __ jcc(Assembler::notZero, loop);
3064   __ pushptr(Address(rcx, 0));     // Save final return address
3065 
3066   // Re-push self-frame
3067   __ enter();                 // Save old & set new rbp
3068   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3069                               // Prolog
3070 
3071   // Use rbp because the frames look interpreted now
3072   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3073   // Don't need the precise return PC here, just precise enough to point into this code blob.
3074   address the_pc = __ pc();
3075   __ set_last_Java_frame(noreg, rbp, the_pc);
3076 
3077   // Call C code.  Need thread but NOT official VM entry
3078   // crud.  We cannot block on this call, no GC can happen.  Call should
3079   // restore return values to their stack-slots with the new SP.
3080   // Thread is in rdi already.
3081   //
3082   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3083 
3084   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3085   __ mov(c_rarg0, r15_thread);
3086   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3087   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3088 
3089   // Set an oopmap for the call site
3090   // Use the same PC we used for the last java frame
3091   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3092 
3093   // Clear fp AND pc
3094   __ reset_last_Java_frame(true);
3095 
3096   // Pop self-frame.
3097   __ leave();                 // Epilog
3098 
3099   // Jump to interpreter
3100   __ ret(0);
3101 
3102   // Make sure all code is generated
3103   masm->flush();
3104 
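       // SimpleRuntimeFrame::framesize is in 32-bit slots; shifting right by one
       // converts it to the 64-bit words expected by UncommonTrapBlob::create.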
3105   _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
3106                                                  SimpleRuntimeFrame::framesize >> 1);
3107 }
3108 #endif // COMPILER2
3109 
3110 //------------------------------generate_handler_blob------
3111 //
3112 // Generate a special Compile2Runtime blob that saves all registers,
3113 // and setup oopmap.
3114 //
3115 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3116   assert(StubRoutines::forward_exception_entry() != NULL,
3117          "must be generated before");
3118 
3119   ResourceMark rm;
3120   OopMapSet *oop_maps = new OopMapSet();
3121   OopMap* map;
3122 
3123   // Allocate space for the code.  Setup code generation tools.
3124   CodeBuffer buffer("handler_blob", 2048, 1024);
3125   MacroAssembler* masm = new MacroAssembler(&buffer);
3126 
3127   address start   = __ pc();
3128   address call_pc = NULL;
3129   int frame_size_in_words;
3130   bool cause_return = (poll_type == POLL_AT_RETURN);
3131   bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3132 
3133   if (UseRTMLocking) {
3134     // Abort RTM transaction before calling runtime
3135     // because critical section will be large and will be
3136     // aborted anyway. Also nmethod could be deoptimized.
3137     __ xabort(0);
3138   }
3139 
3140   // Make room for return address (or push it again)
3141   if (!cause_return) {
3142     __ push(rbx);
3143   }
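       // When the poll did not happen at a return there is no return address on
       // top of the frame yet; rbx is pushed only as a placeholder and is
       // overwritten below with the pc saved by the signal handler.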
3144 
3145   // Save registers, fpu state, and flags
3146   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
3147 
3148   // The following is basically a call_VM.  However, we need the precise
3149   // address of the call in order to generate an oopmap. Hence, we do all the
3150   // work ourselves.
3151 
3152   __ set_last_Java_frame(noreg, noreg, NULL);
3153 
3154   // The return address must always be correct so that frame constructor never
3155   // sees an invalid pc.
3156 
3157   if (!cause_return) {
3158     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3159     // Additionally, rbx is a callee saved register and we can look at it later to determine
3160     // if someone changed the return address for us!
3161     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3162     __ movptr(Address(rbp, wordSize), rbx);
3163   }
3164 
3165   // Do the call
3166   __ mov(c_rarg0, r15_thread);
3167   __ call(RuntimeAddress(call_ptr));
3168 
3169   // Set an oopmap for the call site.  This oopmap will map all
3170   // oop-registers and debug-info registers as callee-saved.  This
3171   // will allow deoptimization at this safepoint to find all possible
3172   // debug-info recordings, as well as let GC find all oops.
3173 
3174   oop_maps->add_gc_map( __ pc() - start, map);
3175 
3176   Label noException;
3177 
3178   __ reset_last_Java_frame(false);
3179 
3180   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3181   __ jcc(Assembler::equal, noException);
3182 
3183   // Exception pending
3184 
3185   RegisterSaver::restore_live_registers(masm, save_vectors);
3186 
3187   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3188 
3189   // No exception case
3190   __ bind(noException);
3191 
3192   Label no_adjust;
3193 #ifdef ASSERT
3194   Label bail;
3195 #endif
3196   if (!cause_return) {
3197     Label no_prefix, not_special;
3198 
3199     // If our stashed return pc was modified by the runtime we avoid touching it
3200     __ cmpptr(rbx, Address(rbp, wordSize));
3201     __ jccb(Assembler::notEqual, no_adjust);
3202 
3203     // Skip over the poll instruction.
3204     // See NativeInstruction::is_safepoint_poll()
3205     // Possible encodings:
3206     //      85 00       test   %eax,(%rax)
3207     //      85 01       test   %eax,(%rcx)
3208     //      85 02       test   %eax,(%rdx)
3209     //      85 03       test   %eax,(%rbx)
3210     //      85 06       test   %eax,(%rsi)
3211     //      85 07       test   %eax,(%rdi)
3212     //
3213     //   41 85 00       test   %eax,(%r8)
3214     //   41 85 01       test   %eax,(%r9)
3215     //   41 85 02       test   %eax,(%r10)
3216     //   41 85 03       test   %eax,(%r11)
3217     //   41 85 06       test   %eax,(%r14)
3218     //   41 85 07       test   %eax,(%r15)
3219     //
3220     //      85 04 24    test   %eax,(%rsp)
3221     //   41 85 04 24    test   %eax,(%r12)
3222     //      85 45 00    test   %eax,0x0(%rbp)
3223     //   41 85 45 00    test   %eax,0x0(%r13)
3224 
3225     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3226     __ jcc(Assembler::notEqual, no_prefix);
3227     __ addptr(rbx, 1);
3228     __ bind(no_prefix);
3229 #ifdef ASSERT
3230     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3231 #endif
3232     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3233     // r12/rsp 0x04
3234     // r13/rbp 0x05
3235     __ movzbq(rcx, Address(rbx, 1));
3236     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3237     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3238     __ cmpptr(rcx, 1);
3239     __ jcc(Assembler::above, not_special);
3240     __ addptr(rbx, 1);
3241     __ bind(not_special);
3242 #ifdef ASSERT
3243     // Verify the correct encoding of the poll we're about to skip.
3244     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3245     __ jcc(Assembler::notEqual, bail);
3246     // Mask out the modrm bits
3247     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3248     // rax encodes to 0, so if the bits are nonzero it's incorrect
3249     __ jcc(Assembler::notZero, bail);
3250 #endif
3251     // Adjust return pc forward to step over the safepoint poll instruction
3252     __ addptr(rbx, 2);
3253     __ movptr(Address(rbp, wordSize), rbx);
3254   }
3255 
3256   __ bind(no_adjust);
3257   // Normal exit, restore registers and exit.
3258   RegisterSaver::restore_live_registers(masm, save_vectors);
3259   __ ret(0);
3260 
3261 #ifdef ASSERT
3262   __ bind(bail);
3263   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3264 #endif
3265 
3266   // Make sure all code is generated
3267   masm->flush();
3268 
3269   // Fill-out other meta info
3270   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3271 }
3272 
3273 //
3274 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3275 //
3276 // Generate a stub that calls into vm to find out the proper destination
3277 // of a java call. All the argument registers are live at this point
3278 // but since this is generic code we don't know what they are and the caller
3279 // must do any gc of the args.
3280 //
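     // On a successful resolution the destination to jump to comes back in rax
     // (and the resolved Method* is published via the thread's vm_result_2); the
     // stub restores the saved argument registers and jumps there.  If an
     // exception is pending, it forwards to the exception handler instead.
     //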
3281 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3282   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3283 
3284   // allocate space for the code
3285   ResourceMark rm;
3286 
3287   CodeBuffer buffer(name, 1200, 512);
3288   MacroAssembler* masm                = new MacroAssembler(&buffer);
3289 
3290   int frame_size_in_words;
3291 
3292   OopMapSet *oop_maps = new OopMapSet();
3293   OopMap* map = NULL;
3294 
3295   int start = __ offset();
3296 
3297   // No need to save vector registers since they are caller-saved anyway.
3298   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
3299 
3300   int frame_complete = __ offset();
3301 
3302   __ set_last_Java_frame(noreg, noreg, NULL);
3303 
3304   __ mov(c_rarg0, r15_thread);
3305 
3306   __ call(RuntimeAddress(destination));
3307 
3308 
3309   // Set an oopmap for the call site.
3310   // We need this not only for callee-saved registers, but also for volatile
3311   // registers that the compiler might be keeping live across a safepoint.
3312 
3313   oop_maps->add_gc_map( __ offset() - start, map);
3314 
3315   // rax contains the address we are going to jump to assuming no exception got installed
3316 
3317   // clear last_Java_sp
3318   __ reset_last_Java_frame(false);
3319   // check for pending exceptions
3320   Label pending;
3321   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3322   __ jcc(Assembler::notEqual, pending);
3323 
3324   // get the returned Method*
3325   __ get_vm_result_2(rbx, r15_thread);
3326   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3327 
3328   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3329 
3330   RegisterSaver::restore_live_registers(masm);
3331 
3332   // We are back to the original state on entry and ready to go.
3333 
3334   __ jmp(rax);
3335 
3336   // Pending exception after the safepoint
3337 
3338   __ bind(pending);
3339 
3340   RegisterSaver::restore_live_registers(masm);
3341 
3342   // exception pending => remove activation and forward to exception handler
3343 
3344   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3345 
3346   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3347   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3348 
3349   // -------------
3350   // make sure all code is generated
3351   masm->flush();
3352 
3353   // return the blob
3354   // frame_size_words or bytes??
3355   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3356 }
3357 
3358 #ifdef COMPILER2
3359 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3360 
3361 class NativeInvokerGenerator : public StubCodeGenerator {
3362   address _call_target;
3363   int _shadow_space_bytes;
3364 
3365   const GrowableArray<VMReg>& _input_registers;
3366   const GrowableArray<VMReg>& _output_registers;
3367 
3368   int _frame_complete;
3369   int _framesize;
3370   OopMapSet* _oop_maps;
3371 public:
3372   NativeInvokerGenerator(CodeBuffer* buffer,
3373                          address call_target,
3374                          int shadow_space_bytes,
3375                          const GrowableArray<VMReg>& input_registers,
3376                          const GrowableArray<VMReg>& output_registers)
3377    : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3378      _call_target(call_target),
3379      _shadow_space_bytes(shadow_space_bytes),
3380      _input_registers(input_registers),
3381      _output_registers(output_registers),
3382      _frame_complete(0),
3383      _framesize(0),
3384      _oop_maps(NULL) {
3385     assert(_output_registers.length() <= 1
3386            || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3387 
3388   }
3389 
3390   void generate();
3391 
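       // Size of the stack slot used to spill the single return register across
       // the slow paths below: 8 bytes for a general-purpose register, or the
       // full XMM/YMM/ZMM width for a floating-point/vector return, depending on
       // the AVX level in use.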
3392   int spill_size_in_bytes() const {
3393     if (_output_registers.length() == 0) {
3394       return 0;
3395     }
3396     VMReg reg = _output_registers.at(0);
3397     assert(reg->is_reg(), "must be a register");
3398     if (reg->is_Register()) {
3399       return 8;
3400     } else if (reg->is_XMMRegister()) {
3401       if (UseAVX >= 3) {
3402         return 64;
3403       } else if (UseAVX >= 1) {
3404         return 32;
3405       } else {
3406         return 16;
3407       }
3408     } else {
3409       ShouldNotReachHere();
3410     }
3411     return 0;
3412   }
3413 
3414   void spill_out_registers() {
3415     if (_output_registers.length() == 0) {
3416       return;
3417     }
3418     VMReg reg = _output_registers.at(0);
3419     assert(reg->is_reg(), "must be a register");
3420     MacroAssembler* masm = _masm;
3421     if (reg->is_Register()) {
3422       __ movptr(Address(rsp, 0), reg->as_Register());
3423     } else if (reg->is_XMMRegister()) {
3424       if (UseAVX >= 3) {
3425         __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3426       } else if (UseAVX >= 1) {
3427         __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3428       } else {
3429         __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3430       }
3431     } else {
3432       ShouldNotReachHere();
3433     }
3434   }
3435 
3436   void fill_out_registers() {
3437     if (_output_registers.length() == 0) {
3438       return;
3439     }
3440     VMReg reg = _output_registers.at(0);
3441     assert(reg->is_reg(), "must be a register");
3442     MacroAssembler* masm = _masm;
3443     if (reg->is_Register()) {
3444       __ movptr(reg->as_Register(), Address(rsp, 0));
3445     } else if (reg->is_XMMRegister()) {
3446       if (UseAVX >= 3) {
3447         __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3448       } else if (UseAVX >= 1) {
3449         __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3450       } else {
3451         __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3452       }
3453     } else {
3454       ShouldNotReachHere();
3455     }
3456   }
3457 
3458   int frame_complete() const {
3459     return _frame_complete;
3460   }
3461 
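       // _framesize is tracked in 32-bit stack slots; shifting right by
       // (LogBytesPerWord - LogBytesPerInt) == 1 converts it to the 64-bit words
       // that RuntimeStub::new_runtime_stub expects.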
3462   int framesize() const {
3463     return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3464   }
3465 
3466   OopMapSet* oop_maps() const {
3467     return _oop_maps;
3468   }
3469 
3470 private:
3471 #ifdef ASSERT
3472 bool target_uses_register(VMReg reg) {
3473   return _input_registers.contains(reg) || _output_registers.contains(reg);
3474 }
3475 #endif
3476 };
3477 
3478 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3479                                                 int shadow_space_bytes,
3480                                                 const GrowableArray<VMReg>& input_registers,
3481                                                 const GrowableArray<VMReg>& output_registers) {
3482   int locs_size  = 64;
3483   CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3484   NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3485   g.generate();
3486   code.log_section_sizes("nep_invoker_blob");
3487 
3488   RuntimeStub* stub =
3489     RuntimeStub::new_runtime_stub("nep_invoker_blob",
3490                                   &code,
3491                                   g.frame_complete(),
3492                                   g.framesize(),
3493                                   g.oop_maps(), false);
3494   return stub;
3495 }
3496 
3497 void NativeInvokerGenerator::generate() {
3498   assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3499 
3500   enum layout {
3501     rbp_off,
3502     rbp_off2,
3503     return_off,
3504     return_off2,
3505     framesize // inclusive of return address
3506   };
3507 
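       // Frame size in 32-bit slots: the four enum slots (saved rbp plus return
       // address) plus the shadow space and the out-register spill area converted
       // from bytes to slots, rounded up to a multiple of 4 slots (16 bytes) so
       // that rsp stays 16-byte aligned.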
3508   _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3509   assert(is_even(_framesize/2), "sp not 16-byte aligned");
3510 
3511   _oop_maps  = new OopMapSet();
3512   MacroAssembler* masm = _masm;
3513 
3514   address start = __ pc();
3515 
3516   __ enter();
3517 
3518   // return address and rbp are already in place
3519   __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3520 
3521   _frame_complete = __ pc() - start;
3522 
3523   address the_pc = __ pc();
3524 
3525   __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3526   OopMap* map = new OopMap(_framesize, 0);
3527   _oop_maps->add_gc_map(the_pc - start, map);
3528 
3529   // State transition
3530   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3531 
3532   __ call(RuntimeAddress(_call_target));
3533 
3534   __ restore_cpu_control_state_after_jni();
3535 
3536   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3537 
3538   // Force this write out before the read below
3539   __ membar(Assembler::Membar_mask_bits(
3540           Assembler::LoadLoad | Assembler::LoadStore |
3541           Assembler::StoreLoad | Assembler::StoreStore));
3542 
3543   Label L_after_safepoint_poll;
3544   Label L_safepoint_poll_slow_path;
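       // We are now in _thread_in_native_trans: the poll below notices a
       // safepoint or handshake that started while we were in native (or pending
       // suspend flags) and diverts to the slow path, which calls
       // check_special_condition_for_native_trans before we resume in Java.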
3545 
3546   __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3547   __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3548   __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3549 
3550   __ bind(L_after_safepoint_poll);
3551 
3552   // change thread state
3553   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3554 
3555   __ block_comment("reguard stack check");
3556   Label L_reguard;
3557   Label L_after_reguard;
3558   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3559   __ jcc(Assembler::equal, L_reguard);
3560   __ bind(L_after_reguard);
3561 
3562   __ reset_last_Java_frame(r15_thread, true);
3563 
3564   __ leave(); // required for proper stackwalking of RuntimeStub frame
3565   __ ret(0);
3566 
3567   //////////////////////////////////////////////////////////////////////////////
3568 
3569   __ block_comment("{ L_safepoint_poll_slow_path");
3570   __ bind(L_safepoint_poll_slow_path);
3571   __ vzeroupper();
3572 
3573   spill_out_registers();
3574 
3575   __ mov(c_rarg0, r15_thread);
3576   __ mov(r12, rsp); // remember sp
3577   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3578   __ andptr(rsp, -16); // align stack as required by ABI
3579   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3580   __ mov(rsp, r12); // restore sp
3581   __ reinit_heapbase();
3582 
3583   fill_out_registers();
3584 
3585   __ jmp(L_after_safepoint_poll);
3586   __ block_comment("} L_safepoint_poll_slow_path");
3587 
3588   //////////////////////////////////////////////////////////////////////////////
3589 
3590   __ block_comment("{ L_reguard");
3591   __ bind(L_reguard);
3592   __ vzeroupper();
3593 
3594   spill_out_registers();
3595 
3596   __ mov(r12, rsp); // remember sp
3597   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3598   __ andptr(rsp, -16); // align stack as required by ABI
3599   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3600   __ mov(rsp, r12); // restore sp
3601   __ reinit_heapbase();
3602 
3603   fill_out_registers();
3604 
3605   __ jmp(L_after_reguard);
3606 
3607   __ block_comment("} L_reguard");
3608 
3609   //////////////////////////////////////////////////////////////////////////////
3610 
3611   __ flush();
3612 }
3613 #endif // COMPILER2
3614 
3615 //------------------------------Montgomery multiplication------------------------
3616 //
3617 
3618 #ifndef _WINDOWS
3619 
3620 // Subtract 0:b from carry:a.  Return carry.
3621 static julong
3622 sub(julong a[], julong b[], julong carry, long len) {
3623   long long i = 0, cnt = len;
3624   julong tmp;
3625   asm volatile("clc; "
3626                "0: ; "
3627                "mov (%[b], %[i], 8), %[tmp]; "
3628                "sbb %[tmp], (%[a], %[i], 8); "
3629                "inc %[i]; dec %[cnt]; "
3630                "jne 0b; "
3631                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3632                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3633                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3634                : "memory");
3635   return tmp;
3636 }
3637 
3638 // Multiply (unsigned) Long A by Long B, accumulating the double-
3639 // length result into the accumulator formed of T0, T1, and T2.
3640 #define MACC(A, B, T0, T1, T2)                                  \
3641 do {                                                            \
3642   unsigned long hi, lo;                                         \
3643   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3644            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3645            : "r"(A), "a"(B) : "cc");                            \
3646  } while(0)
3647 
3648 // As above, but add twice the double-length result into the
3649 // accumulator.
3650 #define MACC2(A, B, T0, T1, T2)                                 \
3651 do {                                                            \
3652   unsigned long hi, lo;                                         \
3653   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3654            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3655            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3656            : "r"(A), "a"(B) : "cc");                            \
3657  } while(0)
3658 
3659 #else //_WINDOWS
3660 
3661 static julong
3662 sub(julong a[], julong b[], julong carry, long len) {
3663   long i;
3664   julong tmp;
3665   unsigned char c = 1;
3666   for (i = 0; i < len; i++) {
3667     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3668     a[i] = tmp;
3669   }
3670   c = _addcarry_u64(c, carry, ~0, &tmp);
3671   return tmp;
3672 }
3673 
3674 // Multiply (unsigned) Long A by Long B, accumulating the double-
3675 // length result into the accumulator formed of T0, T1, and T2.
3676 #define MACC(A, B, T0, T1, T2)                          \
3677 do {                                                    \
3678   julong hi, lo;                            \
3679   lo = _umul128(A, B, &hi);                             \
3680   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3681   c = _addcarry_u64(c, hi, T1, &T1);                    \
3682   _addcarry_u64(c, T2, 0, &T2);                         \
3683  } while(0)
3684 
3685 // As above, but add twice the double-length result into the
3686 // accumulator.
3687 #define MACC2(A, B, T0, T1, T2)                         \
3688 do {                                                    \
3689   julong hi, lo;                            \
3690   lo = _umul128(A, B, &hi);                             \
3691   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3692   c = _addcarry_u64(c, hi, T1, &T1);                    \
3693   _addcarry_u64(c, T2, 0, &T2);                         \
3694   c = _addcarry_u64(0, lo, T0, &T0);                    \
3695   c = _addcarry_u64(c, hi, T1, &T1);                    \
3696   _addcarry_u64(c, T2, 0, &T2);                         \
3697  } while(0)
3698 
3699 #endif //_WINDOWS
3700 
3701 // Fast Montgomery multiplication.  The derivation of the algorithm is
3702 // in  A Cryptographic Library for the Motorola DSP56000,
3703 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
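     //
     // The reduction step relies on inv == -n[0]^-1 (mod 2^64), which the assert
     // below checks: choosing m[i] = t0 * inv makes t0 + m[i]*n[0] divisible by
     // 2^64, so the low accumulator word can be dropped each round
     // (t0 = t1; t1 = t2; t2 = 0).  After the second loop, m holds
     // a * b * 2^(-64*len) modulo n, except that any remaining carry in t0
     // signals an extra multiple of n; the trailing sub() loop subtracts n
     // until that carry is gone.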
3704 
3705 static void NOINLINE
3706 montgomery_multiply(julong a[], julong b[], julong n[],
3707                     julong m[], julong inv, int len) {
3708   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3709   int i;
3710 
3711   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3712 
3713   for (i = 0; i < len; i++) {
3714     int j;
3715     for (j = 0; j < i; j++) {
3716       MACC(a[j], b[i-j], t0, t1, t2);
3717       MACC(m[j], n[i-j], t0, t1, t2);
3718     }
3719     MACC(a[i], b[0], t0, t1, t2);
3720     m[i] = t0 * inv;
3721     MACC(m[i], n[0], t0, t1, t2);
3722 
3723     assert(t0 == 0, "broken Montgomery multiply");
3724 
3725     t0 = t1; t1 = t2; t2 = 0;
3726   }
3727 
3728   for (i = len; i < 2*len; i++) {
3729     int j;
3730     for (j = i-len+1; j < len; j++) {
3731       MACC(a[j], b[i-j], t0, t1, t2);
3732       MACC(m[j], n[i-j], t0, t1, t2);
3733     }
3734     m[i-len] = t0;
3735     t0 = t1; t1 = t2; t2 = 0;
3736   }
3737 
3738   while (t0)
3739     t0 = sub(m, n, t0, len);
3740 }
3741 
3742 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3743 // multiplies so it should be up to 25% faster than Montgomery
3744 // multiplication.  However, its loop control is more complex and it
3745 // may actually run slower on some machines.
3746 
3747 static void NOINLINE
3748 montgomery_square(julong a[], julong n[],
3749                   julong m[], julong inv, int len) {
3750   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3751   int i;
3752 
3753   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3754 
3755   for (i = 0; i < len; i++) {
3756     int j;
3757     int end = (i+1)/2;
3758     for (j = 0; j < end; j++) {
3759       MACC2(a[j], a[i-j], t0, t1, t2);
3760       MACC(m[j], n[i-j], t0, t1, t2);
3761     }
3762     if ((i & 1) == 0) {
3763       MACC(a[j], a[j], t0, t1, t2);
3764     }
3765     for (; j < i; j++) {
3766       MACC(m[j], n[i-j], t0, t1, t2);
3767     }
3768     m[i] = t0 * inv;
3769     MACC(m[i], n[0], t0, t1, t2);
3770 
3771     assert(t0 == 0, "broken Montgomery square");
3772 
3773     t0 = t1; t1 = t2; t2 = 0;
3774   }
3775 
3776   for (i = len; i < 2*len; i++) {
3777     int start = i-len+1;
3778     int end = start + (len - start)/2;
3779     int j;
3780     for (j = start; j < end; j++) {
3781       MACC2(a[j], a[i-j], t0, t1, t2);
3782       MACC(m[j], n[i-j], t0, t1, t2);
3783     }
3784     if ((i & 1) == 0) {
3785       MACC(a[j], a[j], t0, t1, t2);
3786     }
3787     for (; j < len; j++) {
3788       MACC(m[j], n[i-j], t0, t1, t2);
3789     }
3790     m[i-len] = t0;
3791     t0 = t1; t1 = t2; t2 = 0;
3792   }
3793 
3794   while (t0)
3795     t0 = sub(m, n, t0, len);
3796 }
3797 
3798 // Swap words in a longword.
3799 static julong swap(julong x) {
3800   return (x << 32) | (x >> 32);
3801 }
3802 
3803 // Copy len longwords from s to d, word-swapping as we go.  The
3804 // destination array is reversed.
3805 static void reverse_words(julong *s, julong *d, int len) {
3806   d += len;
3807   while(len-- > 0) {
3808     d--;
3809     *d = swap(*s);
3810     s++;
3811   }
3812 }
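     // Together, swap() and reverse_words() convert between the big-endian jint
     // magnitude layout used on the Java side and the least-significant-word-first
     // julong arrays that the Montgomery routines above operate on.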
3813 
3814 // The threshold at which squaring is advantageous was determined
3815 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3816 #define MONTGOMERY_SQUARING_THRESHOLD 64
3817 
3818 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3819                                         jint len, jlong inv,
3820                                         jint *m_ints) {
3821   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3822   int longwords = len/2;
3823 
3824   // Make very sure we don't use so much space that the stack might
3825   // overflow.  512 jints correspond to a 16384-bit integer and
3826   // will use a total of 8K bytes of stack space here.
3827   int divisor = sizeof(julong) * 4;
3828   guarantee(longwords <= 8192 / divisor, "must be");
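       // With divisor == 32 this caps longwords at 256 (i.e. len <= 512 jints),
       // so total_allocation is at most 256 * 4 * 8 = 8192 bytes.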
3829   int total_allocation = longwords * sizeof (julong) * 4;
3830   julong *scratch = (julong *)alloca(total_allocation);
3831 
3832   // Local scratch arrays
3833   julong
3834     *a = scratch + 0 * longwords,
3835     *b = scratch + 1 * longwords,
3836     *n = scratch + 2 * longwords,
3837     *m = scratch + 3 * longwords;
3838 
3839   reverse_words((julong *)a_ints, a, longwords);
3840   reverse_words((julong *)b_ints, b, longwords);
3841   reverse_words((julong *)n_ints, n, longwords);
3842 
3843   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3844 
3845   reverse_words(m, (julong *)m_ints, longwords);
3846 }
3847 
3848 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3849                                       jint len, jlong inv,
3850                                       jint *m_ints) {
3851   assert(len % 2 == 0, "array length in montgomery_square must be even");
3852   int longwords = len/2;
3853 
3854   // Make very sure we don't use so much space that the stack might
3855   // overflow.  512 jints correspond to a 16384-bit integer and
3856   // will use a total of 6K bytes of stack space here.
3857   int divisor = sizeof(julong) * 3;
3858   guarantee(longwords <= (8192 / divisor), "must be");
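       // With divisor == 24 the guarantee allows up to 341 longwords, i.e. at
       // most 341 * 3 * 8 = 8184 bytes of scratch space.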
3859   int total_allocation = longwords * sizeof (julong) * 3;
3860   julong *scratch = (julong *)alloca(total_allocation);
3861 
3862   // Local scratch arrays
3863   julong
3864     *a = scratch + 0 * longwords,
3865     *n = scratch + 1 * longwords,
3866     *m = scratch + 2 * longwords;
3867 
3868   reverse_words((julong *)a_ints, a, longwords);
3869   reverse_words((julong *)n_ints, n, longwords);
3870 
3871   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3872     ::montgomery_square(a, n, m, (julong)inv, longwords);
3873   } else {
3874     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3875   }
3876 
3877   reverse_words(m, (julong *)m_ints, longwords);
3878 }
3879 
3880 #ifdef COMPILER2
3881 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3882 //
3883 //------------------------------generate_exception_blob---------------------------
3884 // Creates the exception blob at the end.
3885 // This code (the exception blob) is jumped to from a compiled method.
3886 // (see emit_exception_handler in x86_64.ad file)
3887 //
3888 // Given an exception pc at a call we call into the runtime for the
3889 // handler in this method. This handler might merely restore state
3890 // (i.e., callee-save registers), unwind the frame, and jump to the
3891 // exception handler for the nmethod if there is no Java-level handler
3892 // for the nmethod.
3893 //
3894 // This code is entered with a jmp.
3895 //
3896 // Arguments:
3897 //   rax: exception oop
3898 //   rdx: exception pc
3899 //
3900 // Results:
3901 //   rax: exception oop
3902 //   rdx: exception pc in caller or ???
3903 //   destination: exception handler of caller
3904 //
3905 // Note: the exception pc MUST be at a call (precise debug information)
3906 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3907 //
3908 
3909 void OptoRuntime::generate_exception_blob() {
3910   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3911   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3912   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3913 
3914   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3915 
3916   // Allocate space for the code
3917   ResourceMark rm;
3918   // Setup code generation tools
3919   CodeBuffer buffer("exception_blob", 2048, 1024);
3920   MacroAssembler* masm = new MacroAssembler(&buffer);
3921 
3922 
3923   address start = __ pc();
3924 
3925   // Exception pc is 'return address' for stack walker
3926   __ push(rdx);
3927   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3928 
3929   // Save callee-saved registers.  See x86_64.ad.
3930 
3931   // rbp is an implicitly saved callee-saved register (i.e., the calling
3932   // convention will save/restore it in the prolog/epilog). Other than that,
3933   // there are no callee-saved registers now that adapter frames are gone.
3934 
3935   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3936 
3937   // Store exception in Thread object. We cannot pass any arguments to the
3938   // handle_exception call, since we do not want to make any assumption
3939   // about the size of the frame where the exception happened in.
3940   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3941   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3942   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3943 
3944   // This call does all the hard work.  It checks if an exception handler
3945   // exists in the method.
3946   // If so, it returns the handler address.
3947   // If not, it prepares for stack-unwinding, restoring the callee-save
3948   // registers of the frame being removed.
3949   //
3950   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3951 
3952   // At a method handle call, the stack may not be properly aligned
3953   // when returning with an exception.
3954   address the_pc = __ pc();
3955   __ set_last_Java_frame(noreg, noreg, the_pc);
3956   __ mov(c_rarg0, r15_thread);
3957   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3958   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3959 
3960   // Set an oopmap for the call site.  This oopmap will only be used if we
3961   // are unwinding the stack.  Hence, all locations will be dead.
3962   // Callee-saved registers will be the same as the frame above (i.e.,
3963   // handle_exception_stub), since they were restored when we got the
3964   // exception.
3965 
3966   OopMapSet* oop_maps = new OopMapSet();
3967 
3968   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3969 
3970   __ reset_last_Java_frame(false);
3971 
3972   // Restore callee-saved registers
3973 
3974   // rbp is an implicitly saved callee-saved register (i.e., the calling
3975   // convention will save/restore it in the prolog/epilog). Other than that,
3976   // there are no callee-saved registers now that adapter frames are gone.
3977 
3978   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3979 
3980   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3981   __ pop(rdx);                  // No need for exception pc anymore
3982 
3983   // rax: exception handler
3984 
3985   // We have a handler in rax (could be deopt blob).
3986   __ mov(r8, rax);
3987 
3988   // Get the exception oop
3989   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3990   // Get the exception pc in case we are deoptimized
3991   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3992 #ifdef ASSERT
3993   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3994   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3995 #endif
3996   // Clear the exception oop so GC no longer processes it as a root.
3997   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3998 
3999   // rax: exception oop
4000   // r8:  exception handler
4001   // rdx: exception pc
4002   // Jump to handler
4003 
4004   __ jmp(r8);
4005 
4006   // Make sure all code is generated
4007   masm->flush();
4008 
4009   // Set exception blob
4010   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
4011 }
4012 #endif // COMPILER2
4013 
4014 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
4015   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
4016   CodeBuffer buffer(buf);
4017   short buffer_locs[20];
4018   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
4019                                          sizeof(buffer_locs)/sizeof(relocInfo));
4020 
4021   MacroAssembler* masm = new MacroAssembler(&buffer);
4022 
4023   const Array<SigEntry>* sig_vk = vk->extended_sig();
4024   const Array<VMRegPair>* regs = vk->return_regs();
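 
       // Three entry points are generated below:
       //  - pack_fields_jobject: resolves the pre-allocated buffer from the JNI
       //    handle addressed by r13, then falls through to pack_fields;
       //  - pack_fields: stores the field values from their return registers into
       //    the buffered inline type pointed to by rax;
       //  - unpack_fields: loads the field values from the object in rax back
       //    into the return registers (skipped entirely when rax is null).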
4025 
4026   int pack_fields_jobject_off = __ offset();
4027   // Resolve pre-allocated buffer from JNI handle.
4028   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
4029   __ movptr(rax, Address(r13, 0));
4030   __ resolve_jobject(rax /* value */,
4031                      r15_thread /* thread */,
4032                      r12 /* tmp */);
4033   __ movptr(Address(r13, 0), rax);
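       // Control falls through into the field-packing loop below with the
       // resolved buffer oop left in rax.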
4034 
4035   int pack_fields_off = __ offset();
4036 
4037   int j = 1;
4038   for (int i = 0; i < sig_vk->length(); i++) {
4039     BasicType bt = sig_vk->at(i)._bt;
4040     if (bt == T_PRIMITIVE_OBJECT) {
4041       continue;
4042     }
4043     if (bt == T_VOID) {
4044       if (sig_vk->at(i-1)._bt == T_LONG ||
4045           sig_vk->at(i-1)._bt == T_DOUBLE) {
4046         j++;
4047       }
4048       continue;
4049     }
4050     int off = sig_vk->at(i)._offset;
4051     assert(off > 0, "offset in object should be positive");
4052     VMRegPair pair = regs->at(j);
4053     VMReg r_1 = pair.first();
4054     VMReg r_2 = pair.second();
4055     Address to(rax, off);
4056     if (bt == T_FLOAT) {
4057       __ movflt(to, r_1->as_XMMRegister());
4058     } else if (bt == T_DOUBLE) {
4059       __ movdbl(to, r_1->as_XMMRegister());
4060     } else {
4061       Register val = r_1->as_Register();
4062       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
4063       if (is_reference_type(bt)) {
4064         __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
4065       } else {
4066         __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
4067       }
4068     }
4069     j++;
4070   }
4071   assert(j == regs->length(), "missed a field?");
4072 
4073   __ ret(0);
4074 
4075   int unpack_fields_off = __ offset();
4076 
4077   Label skip;
4078   __ testptr(rax, rax);
4079   __ jcc(Assembler::zero, skip);
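       // A null oop means there is nothing to unpack: skip the field loads and
       // go straight to the return below.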
4080 
4081   j = 1;
4082   for (int i = 0; i < sig_vk->length(); i++) {
4083     BasicType bt = sig_vk->at(i)._bt;
4084     if (bt == T_PRIMITIVE_OBJECT) {
4085       continue;
4086     }
4087     if (bt == T_VOID) {
4088       if (sig_vk->at(i-1)._bt == T_LONG ||
4089           sig_vk->at(i-1)._bt == T_DOUBLE) {
4090         j++;
4091       }
4092       continue;
4093     }
4094     int off = sig_vk->at(i)._offset;
4095     assert(off > 0, "offset in object should be positive");
4096     VMRegPair pair = regs->at(j);
4097     VMReg r_1 = pair.first();
4098     VMReg r_2 = pair.second();
4099     Address from(rax, off);
4100     if (bt == T_FLOAT) {
4101       __ movflt(r_1->as_XMMRegister(), from);
4102     } else if (bt == T_DOUBLE) {
4103       __ movdbl(r_1->as_XMMRegister(), from);
4104     } else if (bt == T_OBJECT || bt == T_ARRAY) {
4105       assert_different_registers(rax, r_1->as_Register());
4106       __ load_heap_oop(r_1->as_Register(), from);
4107     } else {
4108       assert(is_java_primitive(bt), "unexpected basic type");
4109       assert_different_registers(rax, r_1->as_Register());
4110       size_t size_in_bytes = type2aelembytes(bt);
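           // The signedness flag matters only for sub-word types: char and
           // boolean are zero-extended, byte and short are sign-extended.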
4111       __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
4112     }
4113     j++;
4114   }
4115   assert(j == regs->length(), "missed a field?");
4116 
4117   __ bind(skip);
4118   __ ret(0);
4119 
4120   __ flush();
4121 
4122   return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
4123 }
4124 
4125 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
4126                                        int total_in_args, const VMRegPair* in_regs,
4127                                        int total_out_args, VMRegPair* out_regs,
4128                                        GrowableArray<int>& arg_order,
4129                                        VMRegPair tmp_vmreg) {
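       // The ComputeMoveOrder helper is expected to fill 'arg_order' with a
       // sequencing of the in->out argument moves in which no source is read
       // after it has been overwritten, using tmp_vmreg as the scratch slot
       // that breaks move cycles.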
4130   ComputeMoveOrder order(total_in_args, in_regs,
4131                          total_out_args, out_regs,
4132                          in_sig_bt, arg_order, tmp_vmreg);
4133 }