1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/icBuffer.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/compiledICHolder.hpp"
  46 #include "oops/klass.inline.hpp"
  47 #include "prims/methodHandles.hpp"
  48 #include "runtime/jniHandles.hpp"
  49 #include "runtime/safepointMechanism.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/signature.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "runtime/vframeArray.hpp"
  54 #include "runtime/vm_version.hpp"
  55 #include "utilities/align.hpp"
  56 #include "utilities/formatBuffer.hpp"
  57 #include "vmreg_x86.inline.hpp"
  58 #ifdef COMPILER1
  59 #include "c1/c1_Runtime1.hpp"
  60 #endif
  61 #ifdef COMPILER2
  62 #include "opto/runtime.hpp"
  63 #endif
  64 #if INCLUDE_JVMCI
  65 #include "jvmci/jvmciJavaClasses.hpp"
  66 #endif
  67 
  68 #define __ masm->
  69 
  70 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  71 
  72 class SimpleRuntimeFrame {
  73 
  74   public:
  75 
  76   // Most of the runtime stubs have this simple frame layout.
  77   // This class exists to make the layout shared in one place.
  78   // Offsets are for compiler stack slots, which are jints.
  79   enum layout {
  80     // The frame sender code expects that rbp will be in the "natural" place and
  81     // will override any oopMap setting for it. We must therefore force the layout
  82     // so that it agrees with the frame sender code.
  83     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  84     rbp_off2,
  85     return_off, return_off2,
  86     framesize
  87   };
  88 };
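
     // For illustration, assuming frame::arg_reg_save_area_bytes == 0, a
     // SimpleRuntimeFrame looks roughly like this (offsets are 4-byte
     // compiler slots, addresses grow upward):
     //
     //   rsp + 0*BytesPerInt : saved rbp       (rbp_off,    rbp_off2)
     //   rsp + 2*BytesPerInt : return address  (return_off, return_off2)
     //   framesize           : total frame size in 4-byte slots (== 4 here)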
  89 
  90 class RegisterSaver {
  91   // Capture info about frame layout.  Layout offsets are in jint
  92   // units because compiler frame slots are jints.
  93 #define XSAVE_AREA_BEGIN 160
  94 #define XSAVE_AREA_YMM_BEGIN 576
  95 #define XSAVE_AREA_OPMASK_BEGIN 1088
  96 #define XSAVE_AREA_ZMM_BEGIN 1152
  97 #define XSAVE_AREA_UPPERBANK 1664
  98 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  99 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 100 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 101 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 102 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 103   enum layout {
 104     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 105     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 106     DEF_XMM_OFFS(0),
 107     DEF_XMM_OFFS(1),
 108     // 2..15 are implied in range usage
 109     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 110     DEF_YMM_OFFS(0),
 111     DEF_YMM_OFFS(1),
 112     // 2..15 are implied in range usage
 113     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 114     DEF_OPMASK_OFFS(0),
 115     DEF_OPMASK_OFFS(1),
 116     // 2..7 are implied in range usage
 117     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 118     DEF_ZMM_OFFS(0),
 119     DEF_ZMM_OFFS(1),
 120     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 121     DEF_ZMM_UPPER_OFFS(16),
 122     DEF_ZMM_UPPER_OFFS(17),
 123     // 18..31 are implied in range usage
 124     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 125     fpu_stateH_end,
 126     r15_off, r15H_off,
 127     r14_off, r14H_off,
 128     r13_off, r13H_off,
 129     r12_off, r12H_off,
 130     r11_off, r11H_off,
 131     r10_off, r10H_off,
 132     r9_off,  r9H_off,
 133     r8_off,  r8H_off,
 134     rdi_off, rdiH_off,
 135     rsi_off, rsiH_off,
 136     ignore_off, ignoreH_off,  // extra copy of rbp
 137     rsp_off, rspH_off,
 138     rbx_off, rbxH_off,
 139     rdx_off, rdxH_off,
 140     rcx_off, rcxH_off,
 141     rax_off, raxH_off,
 142     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 143     align_off, alignH_off,
 144     flags_off, flagsH_off,
 145     // The frame sender code expects that rbp will be in the "natural" place and
 146     // will override any oopMap setting for it. We must therefore force the layout
 147     // so that it agrees with the frame sender code.
 148     rbp_off, rbpH_off,        // copy of rbp we will restore
 149     return_off, returnH_off,  // slot for return address
 150     reg_save_size             // size in compiler stack slots
 151   };
 152 
 153  public:
 154   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors);
 155   static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 156 
 157   // Offsets into the register save area
 158   // Used by deoptimization when it is managing result register
 159   // values on its own
 160 
 161   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 162   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 163   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 164   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 165   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 166 
 167   // During deoptimization only the result registers need to be restored,
 168   // all the other values have already been extracted.
 169   static void restore_result_registers(MacroAssembler* masm);
 170 };
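
     // For example, deoptimization uses the byte-offset helpers above to find
     // the saved result registers inside the save area laid out by
     // save_live_registers(); each helper is just the enum slot index scaled
     // by BytesPerInt, roughly:
     //
     //   __ movptr(rax,  Address(rsp, RegisterSaver::rax_offset_in_bytes()));
     //   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));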
 171 
 172 // Register is a class, but it is assigned a numerical value.
 173 // "0" is assigned to rax, so we need to ignore -Wnonnull here.
 174 PRAGMA_DIAG_PUSH
 175 PRAGMA_NONNULL_IGNORED
 176 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
 177   int off = 0;
 178   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 179   if (UseAVX < 3) {
 180     num_xmm_regs = num_xmm_regs/2;
 181   }
 182 #if COMPILER2_OR_JVMCI
 183   if (save_vectors && UseAVX == 0) {
 184     save_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 185   }
 186   assert(!save_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 187 #else
 188   save_vectors = false; // vectors are generated only by C2 and JVMCI
 189 #endif
 190 
 191   // Always make the frame size 16-byte aligned; both vector and non-vector stacks get this alignment.
 192   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 193   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 194   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 195   // CodeBlob frame size is in words.
 196   int frame_size_in_words = frame_size_in_bytes / wordSize;
 197   *total_frame_words = frame_size_in_words;
 198 
 199   // Save registers, fpu state, and flags.
 200   // We assume caller has already pushed the return address onto the
 201   // stack, so rsp is 8-byte aligned here.
 202   // We push rbp twice in this sequence because we want the real rbp
 203   // to be under the return address like a normal enter.
 204 
 205   __ enter();          // rsp becomes 16-byte aligned here
 206   __ push_CPU_state(); // Push a multiple of 16 bytes
 207 
 208   // push_CPU_state handles this on EVEX-enabled targets
 209   if (save_vectors) {
 210     // Save upper half of YMM registers(0..15)
 211     int base_addr = XSAVE_AREA_YMM_BEGIN;
 212     for (int n = 0; n < 16; n++) {
 213       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 214     }
 215     if (VM_Version::supports_evex()) {
 216       // Save upper half of ZMM registers(0..15)
 217       base_addr = XSAVE_AREA_ZMM_BEGIN;
 218       for (int n = 0; n < 16; n++) {
 219         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 220       }
 221       // Save full ZMM registers(16..num_xmm_regs)
 222       base_addr = XSAVE_AREA_UPPERBANK;
 223       off = 0;
 224       int vector_len = Assembler::AVX_512bit;
 225       for (int n = 16; n < num_xmm_regs; n++) {
 226         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 227       }
 228 #if COMPILER2_OR_JVMCI
 229       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 230       off = 0;
 231       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 232         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 233       }
 234 #endif
 235     }
 236   } else {
 237     if (VM_Version::supports_evex()) {
 238       // Save upper bank of ZMM registers(16..31) for double/float usage
 239       int base_addr = XSAVE_AREA_UPPERBANK;
 240       off = 0;
 241       for (int n = 16; n < num_xmm_regs; n++) {
 242         __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
 243       }
 244 #if COMPILER2_OR_JVMCI
 245       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 246       off = 0;
 247       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 248         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 249       }
 250 #endif
 251     }
 252   }
 253   __ vzeroupper();
 254   if (frame::arg_reg_save_area_bytes != 0) {
 255     // Allocate argument register save area
 256     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 257   }
 258 
 259   // Set an oopmap for the call site.  This oopmap will map all
 260   // oop-registers and debug-info registers as callee-saved.  This
 261   // will allow deoptimization at this safepoint to find all possible
 262   // debug-info recordings, as well as let GC find all oops.
 263 
 264   OopMapSet *oop_maps = new OopMapSet();
 265   OopMap* map = new OopMap(frame_size_in_slots, 0);
 266 
 267 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 268 
 269   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 271   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 272   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 273   // rbp location is known implicitly by the frame sender code, needs no oopmap
 274   // and the location where rbp was saved is ignored
 275   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 283   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 284   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 285   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 286   // on EVEX-enabled targets it is included in the XSAVE area
 287   off = xmm0_off;
 288   int delta = xmm1_off - off;
 289   for (int n = 0; n < 16; n++) {
 290     XMMRegister xmm_name = as_XMMRegister(n);
 291     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 292     off += delta;
 293   }
 294   if (UseAVX > 2) {
 295     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 296     off = zmm16_off;
 297     delta = zmm17_off - off;
 298     for (int n = 16; n < num_xmm_regs; n++) {
 299       XMMRegister zmm_name = as_XMMRegister(n);
 300       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 301       off += delta;
 302     }
 303   }
 304 
 305 #if COMPILER2_OR_JVMCI
 306   if (save_vectors) {
 307     // Save upper half of YMM registers(0..15)
 308     off = ymm0_off;
 309     delta = ymm1_off - ymm0_off;
 310     for (int n = 0; n < 16; n++) {
 311       XMMRegister ymm_name = as_XMMRegister(n);
 312       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 313       off += delta;
 314     }
 315     if (VM_Version::supports_evex()) {
 316       // Save upper half of ZMM registers(0..15)
 317       off = zmm0_off;
 318       delta = zmm1_off - zmm0_off;
 319       for (int n = 0; n < 16; n++) {
 320         XMMRegister zmm_name = as_XMMRegister(n);
 321         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 322         off += delta;
 323       }
 324     }
 325   }
 326 #endif // COMPILER2_OR_JVMCI
 327 
 328   // %%% These should all be a waste but we'll keep things as they were for now
 329   if (true) {
 330     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 333     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 334     // rbp location is known implicitly by the frame sender code, needs no oopmap
 335     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 343     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 344     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 345     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 346     // on EVEX-enabled targets it is included in the XSAVE area
 347     off = xmm0H_off;
 348     delta = xmm1H_off - off;
 349     for (int n = 0; n < 16; n++) {
 350       XMMRegister xmm_name = as_XMMRegister(n);
 351       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 352       off += delta;
 353     }
 354     if (UseAVX > 2) {
 355       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 356       off = zmm16H_off;
 357       delta = zmm17H_off - off;
 358       for (int n = 16; n < num_xmm_regs; n++) {
 359         XMMRegister zmm_name = as_XMMRegister(n);
 360         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 361         off += delta;
 362       }
 363     }
 364   }
 365 
 366   return map;
 367 }
 368 PRAGMA_DIAG_POP
 369 
 370 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
 371   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 372   if (UseAVX < 3) {
 373     num_xmm_regs = num_xmm_regs/2;
 374   }
 375   if (frame::arg_reg_save_area_bytes != 0) {
 376     // Pop arg register save area
 377     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 378   }
 379 
 380 #if COMPILER2_OR_JVMCI
 381   if (restore_vectors) {
 382     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 383     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 384   }
 385 #else
 386   assert(!restore_vectors, "vectors are generated only by C2");
 387 #endif
 388 
 389   __ vzeroupper();
 390 
 391   // On EVEX-enabled targets everything is handled by pop_CPU_state below
 392   if (restore_vectors) {
 393     // Restore upper half of YMM registers (0..15)
 394     int base_addr = XSAVE_AREA_YMM_BEGIN;
 395     for (int n = 0; n < 16; n++) {
 396       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 397     }
 398     if (VM_Version::supports_evex()) {
 399       // Restore upper half of ZMM registers (0..15)
 400       base_addr = XSAVE_AREA_ZMM_BEGIN;
 401       for (int n = 0; n < 16; n++) {
 402         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 403       }
 404       // Restore full ZMM registers(16..num_xmm_regs)
 405       base_addr = XSAVE_AREA_UPPERBANK;
 406       int vector_len = Assembler::AVX_512bit;
 407       int off = 0;
 408       for (int n = 16; n < num_xmm_regs; n++) {
 409         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 410       }
 411 #if COMPILER2_OR_JVMCI
 412       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 413       off = 0;
 414       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 415         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 416       }
 417 #endif
 418     }
 419   } else {
 420     if (VM_Version::supports_evex()) {
 421       // Restore upper bank of ZMM registers(16..31) for double/float usage
 422       int base_addr = XSAVE_AREA_UPPERBANK;
 423       int off = 0;
 424       for (int n = 16; n < num_xmm_regs; n++) {
 425         __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
 426       }
 427 #if COMPILER2_OR_JVMCI
 428       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 429       off = 0;
 430       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 431         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 432       }
 433 #endif
 434     }
 435   }
 436 
 437   // Recover CPU state
 438   __ pop_CPU_state();
 439   // Get the rbp described implicitly by the calling convention (no oopMap)
 440   __ pop(rbp);
 441 }
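
     // The two routines above are used as a pair around calls into the VM by
     // the stub and blob generators in this file, along the lines of:
     //
     //   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
     //   // ... set up arguments, make the runtime call, record the oop map ...
     //   RegisterSaver::restore_live_registers(masm, save_vectors);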
 442 
 443 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 444 
 445   // Just restore the result registers. Only used by deoptimization. By
 446   // now any callee-save register that needs to be restored to a c2
 447   // caller of the deoptee has been extracted into the vframeArray
 448   // and will be stuffed into the c2i adapter we create for later
 449   // restoration, so only result registers need to be restored here.
 450 
 451   // Restore fp result register
 452   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 453   // Restore integer result register
 454   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 455   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 456 
 457   // Pop all of the register save area off the stack except the return address
 458   __ addptr(rsp, return_offset_in_bytes());
 459 }
 460 
 461 // Is the vector's size (in bytes) bigger than the size saved by default?
 462 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 463 bool SharedRuntime::is_wide_vector(int size) {
 464   return size > 16;
 465 }
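
     // For example, is_wide_vector(16) is false (16-byte XMM state is already
     // covered by the default fxsave/fxrstor save), while is_wide_vector(32)
     // and is_wide_vector(64) are true (YMM/ZMM state needs the extra vector
     // save path in RegisterSaver).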
 466 
 467 // ---------------------------------------------------------------------------
 468 // Read the array of BasicTypes from a signature, and compute where the
 469 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 470 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 471 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 472 // as framesizes are fixed.
 473 // VMRegImpl::stack0 refers to the first slot, 0(sp),
 474 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Registers
 475 // 0 up to RegisterImpl::number_of_registers are the 64-bit
 476 // integer registers.
 477 
 478 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 479 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 480 // units regardless of build.  Of course, for i486 there is no 64-bit build.
 481 
 482 // The Java calling convention is a "shifted" version of the C ABI.
 483 // By skipping the first C ABI register we can call non-static jni methods
 484 // with small numbers of arguments without having to shuffle the arguments
 485 // at all. Since we control the java ABI we ought to at least get some
 486 // advantage out of it.
 487 
 488 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 489                                            VMRegPair *regs,
 490                                            int total_args_passed) {
 491 
 492   // Create the mapping between argument positions and
 493   // registers.
 494   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 495     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 496   };
 497   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 498     j_farg0, j_farg1, j_farg2, j_farg3,
 499     j_farg4, j_farg5, j_farg6, j_farg7
 500   };
 501 
 502 
 503   uint int_args = 0;
 504   uint fp_args = 0;
 505   uint stk_args = 0; // inc by 2 each time
 506 
 507   for (int i = 0; i < total_args_passed; i++) {
 508     switch (sig_bt[i]) {
 509     case T_BOOLEAN:
 510     case T_CHAR:
 511     case T_BYTE:
 512     case T_SHORT:
 513     case T_INT:
 514       if (int_args < Argument::n_int_register_parameters_j) {
 515         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 516       } else {
 517         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 518         stk_args += 2;
 519       }
 520       break;
 521     case T_VOID:
 522       // halves of T_LONG or T_DOUBLE
 523       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 524       regs[i].set_bad();
 525       break;
 526     case T_LONG:
 527       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 528       // fall through
 529     case T_OBJECT:
 530     case T_ARRAY:
 531     case T_ADDRESS:
 532       if (int_args < Argument::n_int_register_parameters_j) {
 533         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 534       } else {
 535         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 536         stk_args += 2;
 537       }
 538       break;
 539     case T_FLOAT:
 540       if (fp_args < Argument::n_float_register_parameters_j) {
 541         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 542       } else {
 543         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 544         stk_args += 2;
 545       }
 546       break;
 547     case T_DOUBLE:
 548       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 549       if (fp_args < Argument::n_float_register_parameters_j) {
 550         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 551       } else {
 552         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 553         stk_args += 2;
 554       }
 555       break;
 556     default:
 557       ShouldNotReachHere();
 558       break;
 559     }
 560   }
 561 
 562   return align_up(stk_args, 2);
 563 }
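
     // For example, for a Java signature (int, long, Object, double, float)
     // the incoming sig_bt is { T_INT, T_LONG, T_VOID, T_OBJECT, T_DOUBLE,
     // T_VOID, T_FLOAT } and the mapping computed above is roughly:
     //
     //   T_INT    -> j_rarg0                 T_DOUBLE -> j_farg0 (both halves)
     //   T_LONG   -> j_rarg1 (both halves)   T_FLOAT  -> j_farg1
     //   T_OBJECT -> j_rarg2                 T_VOID halves -> set_bad()
     //
     // No argument spills to the stack, so the routine returns 0.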
 564 
 565 // Patch the caller's callsite with the entry to compiled code if it exists.
 566 static void patch_callers_callsite(MacroAssembler *masm) {
 567   Label L;
 568   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
 569   __ jcc(Assembler::equal, L);
 570 
 571   // Save the current stack pointer
 572   __ mov(r13, rsp);
 573   // Schedule the branch target address early.
 574   // Call into the VM to patch the caller, then jump to compiled callee
 575   // rax isn't live so capture return address while we easily can
 576   __ movptr(rax, Address(rsp, 0));
 577 
 578   // align stack so push_CPU_state doesn't fault
 579   __ andptr(rsp, -(StackAlignmentInBytes));
 580   __ push_CPU_state();
 581   __ vzeroupper();
 582   // VM needs caller's callsite
 583   // VM needs target method
 584   // This needs to be a long call since we will relocate this adapter to
 585   // the codeBuffer and it may not reach
 586 
 587   // Allocate argument register save area
 588   if (frame::arg_reg_save_area_bytes != 0) {
 589     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 590   }
 591   __ mov(c_rarg0, rbx);
 592   __ mov(c_rarg1, rax);
 593   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 594 
 595   // De-allocate argument register save area
 596   if (frame::arg_reg_save_area_bytes != 0) {
 597     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 598   }
 599 
 600   __ vzeroupper();
 601   __ pop_CPU_state();
 602   // restore sp
 603   __ mov(rsp, r13);
 604   __ bind(L);
 605 }
 606 
 607 
 608 static void gen_c2i_adapter(MacroAssembler *masm,
 609                             int total_args_passed,
 610                             int comp_args_on_stack,
 611                             const BasicType *sig_bt,
 612                             const VMRegPair *regs,
 613                             Label& skip_fixup) {
 614   // Before we get into the guts of the C2I adapter, see if we should be here
 615   // at all.  We've come from compiled code and are attempting to jump to the
 616   // interpreter, which means the caller made a static call to get here
 617   // (vcalls always get a compiled target if there is one).  Check for a
 618   // compiled target.  If there is one, we need to patch the caller's call.
 619   patch_callers_callsite(masm);
 620 
 621   __ bind(skip_fixup);
 622 
 623   // Since all args are passed on the stack, total_args_passed *
 624   // Interpreter::stackElementSize is the space we need. Plus one word because
 625   // we also account for the return address location, since
 626   // we store it first rather than holding it in rax across all the shuffling.
 627 
 628   int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
 629 
 630   // stack is aligned, keep it that way
 631   extraspace = align_up(extraspace, 2*wordSize);
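
       // For example, with the four-slot (long, Object, bool) signature
       // sketched further below (total_args_passed == 4 counting the T_VOID
       // half), and assuming Interpreter::stackElementSize == wordSize == 8,
       // extraspace starts as 4*8 + 8 = 40 bytes and is rounded up to 48 to
       // keep the stack 16-byte aligned.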
 632 
 633   // Get return address
 634   __ pop(rax);
 635 
 636   // set senderSP value
 637   __ mov(r13, rsp);
 638 
 639   __ subptr(rsp, extraspace);
 640 
 641   // Store the return address in the expected location
 642   __ movptr(Address(rsp, 0), rax);
 643 
 644   // Now write the args into the outgoing interpreter space
 645   for (int i = 0; i < total_args_passed; i++) {
 646     if (sig_bt[i] == T_VOID) {
 647       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 648       continue;
 649     }
 650 
 651     // offset to start parameters
 652     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 653     int next_off = st_off - Interpreter::stackElementSize;
 654 
 655     // Say 4 args:
 656     // i   st_off
 657     // 0   32 T_LONG
 658     // 1   24 T_VOID
 659     // 2   16 T_OBJECT
 660     // 3    8 T_BOOL
 661     // -    0 return address
 662     //
 663     // However, to make things extra confusing: because we can fit a long/double in
 664     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 665     // leaves one slot empty and only stores to a single slot. In this case the
 666     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 667 
 668     VMReg r_1 = regs[i].first();
 669     VMReg r_2 = regs[i].second();
 670     if (!r_1->is_valid()) {
 671       assert(!r_2->is_valid(), "");
 672       continue;
 673     }
 674     if (r_1->is_stack()) {
 675       // memory to memory; use rax as a temporary
 676       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 677       if (!r_2->is_valid()) {
 678         // sign extend??
 679         __ movl(rax, Address(rsp, ld_off));
 680         __ movptr(Address(rsp, st_off), rax);
 681 
 682       } else {
 683 
 684         __ movq(rax, Address(rsp, ld_off));
 685 
 686         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 687         // T_DOUBLE and T_LONG use two slots in the interpreter
 688         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 689           // ld_off == LSW, ld_off+wordSize == MSW
 690           // st_off == MSW, next_off == LSW
 691           __ movq(Address(rsp, next_off), rax);
 692 #ifdef ASSERT
 693           // Overwrite the unused slot with known junk
 694           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 695           __ movptr(Address(rsp, st_off), rax);
 696 #endif /* ASSERT */
 697         } else {
 698           __ movq(Address(rsp, st_off), rax);
 699         }
 700       }
 701     } else if (r_1->is_Register()) {
 702       Register r = r_1->as_Register();
 703       if (!r_2->is_valid()) {
 704         // must be only an int (or smaller), so move only 32 bits to the slot
 705         // why not sign extend??
 706         __ movl(Address(rsp, st_off), r);
 707       } else {
 708         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 709         // T_DOUBLE and T_LONG use two slots in the interpreter
 710         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 711           // long/double in gpr
 712 #ifdef ASSERT
 713           // Overwrite the unused slot with known junk
 714           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 715           __ movptr(Address(rsp, st_off), rax);
 716 #endif /* ASSERT */
 717           __ movq(Address(rsp, next_off), r);
 718         } else {
 719           __ movptr(Address(rsp, st_off), r);
 720         }
 721       }
 722     } else {
 723       assert(r_1->is_XMMRegister(), "");
 724       if (!r_2->is_valid()) {
 725         // only a float; use just part of the slot
 726         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 727       } else {
 728 #ifdef ASSERT
 729         // Overwrite the unused slot with known junk
 730         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 731         __ movptr(Address(rsp, st_off), rax);
 732 #endif /* ASSERT */
 733         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 734       }
 735     }
 736   }
 737 
 738   // Schedule the branch target address early.
 739   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 740   __ jmp(rcx);
 741 }
 742 
 743 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 744                         address code_start, address code_end,
 745                         Label& L_ok) {
 746   Label L_fail;
 747   __ lea(temp_reg, ExternalAddress(code_start));
 748   __ cmpptr(pc_reg, temp_reg);
 749   __ jcc(Assembler::belowEqual, L_fail);
 750   __ lea(temp_reg, ExternalAddress(code_end));
 751   __ cmpptr(pc_reg, temp_reg);
 752   __ jcc(Assembler::below, L_ok);
 753   __ bind(L_fail);
 754 }
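
     // Note: the checks above jump to L_ok only when
     // code_start < pc_reg < code_end; a pc equal to code_start or anywhere
     // outside the range falls through to L_fail and returns to the caller,
     // whose code then stops the VM.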
 755 
 756 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 757                                     int total_args_passed,
 758                                     int comp_args_on_stack,
 759                                     const BasicType *sig_bt,
 760                                     const VMRegPair *regs) {
 761 
 762   // Note: r13 contains the senderSP on entry. We must preserve it since
 763   // we may do an i2c -> c2i transition if we lose a race where compiled
 764   // code goes non-entrant while we get the args ready.
 765   // In addition we use r13 to locate all the interpreter args as
 766   // we must align the stack to 16 bytes on an i2c entry; otherwise we
 767   // lose the alignment we expect in all compiled code, and the register
 768   // save code can segv when fxsave instructions find an improperly
 769   // aligned stack pointer.
 770 
 771   // Adapters can be frameless because they do not require the caller
 772   // to perform additional cleanup work, such as correcting the stack pointer.
 773   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 774   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 775   // even if a callee has modified the stack pointer.
 776   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 777   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 778   // up via the senderSP register).
 779   // In other words, if *either* the caller or callee is interpreted, we can
 780   // get the stack pointer repaired after a call.
 781   // This is why c2i and i2c adapters cannot be indefinitely composed.
 782   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 783   // both caller and callee would be compiled methods, and neither would
 784   // clean up the stack pointer changes performed by the two adapters.
 785   // If this happens, control eventually transfers back to the compiled
 786   // caller, but with an uncorrected stack, causing delayed havoc.
 787 
 788   // Pick up the return address
 789   __ movptr(rax, Address(rsp, 0));
 790 
 791   if (VerifyAdapterCalls &&
 792       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
 793     // So, let's test for cascading c2i/i2c adapters right now.
 794     //  assert(Interpreter::contains($return_addr) ||
 795     //         StubRoutines::contains($return_addr),
 796     //         "i2c adapter must return to an interpreter frame");
 797     __ block_comment("verify_i2c { ");
 798     Label L_ok;
 799     if (Interpreter::code() != NULL)
 800       range_check(masm, rax, r11,
 801                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
 802                   L_ok);
 803     if (StubRoutines::code1() != NULL)
 804       range_check(masm, rax, r11,
 805                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
 806                   L_ok);
 807     if (StubRoutines::code2() != NULL)
 808       range_check(masm, rax, r11,
 809                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
 810                   L_ok);
 811     const char* msg = "i2c adapter must return to an interpreter frame";
 812     __ block_comment(msg);
 813     __ stop(msg);
 814     __ bind(L_ok);
 815     __ block_comment("} verify_i2c ");
 816   }
 817 
 818   // Must preserve original SP for loading incoming arguments because
 819   // we need to align the outgoing SP for compiled code.
 820   __ movptr(r11, rsp);
 821 
 822   // Cut-out for having no stack args.  Since up to 6 int/oop args are passed
 823   // in registers, we will occasionally have no stack args.
 824   int comp_words_on_stack = 0;
 825   if (comp_args_on_stack) {
 826     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
 827     // registers are below.  By subtracting stack0, we either get a negative
 828     // number (all values in registers) or the maximum stack slot accessed.
 829 
 830     // Convert 4-byte c2 stack slots to words.
 831     comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 832     // Round up to minimum stack alignment, in wordSize units
 833     comp_words_on_stack = align_up(comp_words_on_stack, 2);
 834     __ subptr(rsp, comp_words_on_stack * wordSize);
 835   }
 836 
 837 
 838   // Ensure compiled code always sees stack at proper alignment
 839   __ andptr(rsp, -16);
 840 
 841   // Push the return address and misalign the stack so that the youngest frame
 842   // always sees it just as it would after the placement of a call instruction
 843   __ push(rax);
 844 
 845   // Put saved SP in another register
 846   const Register saved_sp = rax;
 847   __ movptr(saved_sp, r11);
 848 
 849   // Will jump to the compiled code just as if compiled code was doing it.
 850   // Pre-load the register-jump target early, to schedule it better.
 851   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 852 
 853 #if INCLUDE_JVMCI
 854   if (EnableJVMCI) {
 855     // check if this call should be routed towards a specific entry point
 856     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 857     Label no_alternative_target;
 858     __ jcc(Assembler::equal, no_alternative_target);
 859     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 860     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 861     __ bind(no_alternative_target);
 862   }
 863 #endif // INCLUDE_JVMCI
 864 
 865   // Now generate the shuffle code.  Pick up all register args and move the
 866   // rest through the floating point stack top.
 867   for (int i = 0; i < total_args_passed; i++) {
 868     if (sig_bt[i] == T_VOID) {
 869       // Longs and doubles are passed in native word order, but misaligned
 870       // in the 32-bit build.
 871       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 872       continue;
 873     }
 874 
 875     // Pick up 0, 1 or 2 words from SP+offset.
 876 
 877     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 878             "scrambled load targets?");
 879     // Load in argument order going down.
 880     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 881     // Point to interpreter value (vs. tag)
 882     int next_off = ld_off - Interpreter::stackElementSize;
 883     //
 884     //
 885     //
 886     VMReg r_1 = regs[i].first();
 887     VMReg r_2 = regs[i].second();
 888     if (!r_1->is_valid()) {
 889       assert(!r_2->is_valid(), "");
 890       continue;
 891     }
 892     if (r_1->is_stack()) {
 893       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 894       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 895 
 896       // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
 897       // and if we end up going through a c2i because of a miss a reasonable value of r13
 898       // will be generated.
 899       if (!r_2->is_valid()) {
 900         // sign extend???
 901         __ movl(r13, Address(saved_sp, ld_off));
 902         __ movptr(Address(rsp, st_off), r13);
 903       } else {
 904         //
 905         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 906         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
 907         // So we must adjust where to pick up the data to match the interpreter.
 908         //
 909         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
 910         // are accessed at negative offsets so the LSW is at the lower address
 911 
 912         // ld_off is MSW so get LSW
 913         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 914                            next_off : ld_off;
 915         __ movq(r13, Address(saved_sp, offset));
 916         // st_off is LSW (i.e. reg.first())
 917         __ movq(Address(rsp, st_off), r13);
 918       }
 919     } else if (r_1->is_Register()) {  // Register argument
 920       Register r = r_1->as_Register();
 921       assert(r != rax, "must be different");
 922       if (r_2->is_valid()) {
 923         //
 924         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 925         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
 926         // So we must adjust where to pick up the data to match the interpreter.
 927 
 928         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 929                            next_off : ld_off;
 930 
 931         // this can be a misaligned move
 932         __ movq(r, Address(saved_sp, offset));
 933       } else {
 934         // sign extend and use a full word?
 935         __ movl(r, Address(saved_sp, ld_off));
 936       }
 937     } else {
 938       if (!r_2->is_valid()) {
 939         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 940       } else {
 941         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 942       }
 943     }
 944   }
 945 
 946   __ push_cont_fastpath(r15_thread); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 947 
 948   // 6243940 We might end up in handle_wrong_method if
 949   // the callee is deoptimized as we race through here. If that
 950   // happens we don't want to take a safepoint because the
 951   // caller frame will look interpreted and arguments are now
 952   // "compiled" so it is much better to make this transition
 953   // invisible to the stack walking code. Unfortunately if
 954   // we try to find the callee by normal means a safepoint
 955   // is possible. So we stash the desired callee in the thread
 956   // and the VM will find it there should this case occur.
 957 
 958   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 959 
 960   // put Method* where a c2i would expect it should we end up there;
 961   // only needed because C2 resolve stubs return the Method* as a result in
 962   // rax
 963   __ mov(rax, rbx);
 964   __ jmp(r11);
 965 }
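
     // At the jump above the state handed to the compiled callee is roughly:
     // rbx (and its copy in rax) hold the callee Method*, r11 holds the
     // compiled entry point being jumped to, and the outgoing arguments are
     // already laid out in the compiled register/stack convention produced by
     // java_calling_convention().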
 966 
 967 // ---------------------------------------------------------------
 968 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 969                                                             int total_args_passed,
 970                                                             int comp_args_on_stack,
 971                                                             const BasicType *sig_bt,
 972                                                             const VMRegPair *regs,
 973                                                             AdapterFingerPrint* fingerprint) {
 974   address i2c_entry = __ pc();
 975 
 976   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 977 
 978   // -------------------------------------------------------------------------
 979   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 980   // to the interpreter.  The args start out packed in the compiled layout.  They
 981   // need to be unpacked into the interpreter layout.  This will almost always
 982   // require some stack space.  We grow the current (compiled) stack, then repack
 983   // the args.  We  finally end in a jump to the generic interpreter entry point.
 984   // On exit from the interpreter, the interpreter will restore our SP (lest the
 985   // compiled code, which relies solely on SP and not RBP, get sick).
 986 
 987   address c2i_unverified_entry = __ pc();
 988   Label skip_fixup;
 989   Label ok;
 990 
 991   Register holder = rax;
 992   Register receiver = j_rarg0;
 993   Register temp = rbx;
 994 
 995   {
 996     __ load_klass(temp, receiver, rscratch1);
 997     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
 998     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
 999     __ jcc(Assembler::equal, ok);
1000     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1001 
1002     __ bind(ok);
1003     // Method might have been compiled since the call site was patched to
1004     // interpreted; if that is the case treat it as a miss so we can get
1005     // the call site corrected.
1006     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
1007     __ jcc(Assembler::equal, skip_fixup);
1008     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1009   }
1010 
1011   address c2i_entry = __ pc();
1012 
1013   // Class initialization barrier for static methods
1014   address c2i_no_clinit_check_entry = NULL;
1015   if (VM_Version::supports_fast_class_init_checks()) {
1016     Label L_skip_barrier;
1017     Register method = rbx;
1018 
1019     { // Bypass the barrier for non-static methods
1020       Register flags  = rscratch1;
1021       __ movl(flags, Address(method, Method::access_flags_offset()));
1022       __ testl(flags, JVM_ACC_STATIC);
1023       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1024     }
1025 
1026     Register klass = rscratch1;
1027     __ load_method_holder(klass, method);
1028     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1029 
1030     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1031 
1032     __ bind(L_skip_barrier);
1033     c2i_no_clinit_check_entry = __ pc();
1034   }
1035 
1036   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1037   bs->c2i_entry_barrier(masm);
1038 
1039   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1040 
1041   __ flush();
1042   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1043 }
1044 
1045 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1046                                          VMRegPair *regs,
1047                                          VMRegPair *regs2,
1048                                          int total_args_passed) {
1049   assert(regs2 == NULL, "not needed on x86");
1050 // We return the number of VMRegImpl stack slots we need to reserve for all
1051 // the arguments NOT counting out_preserve_stack_slots.
1052 
1053 // NOTE: These arrays will have to change when c1 is ported
1054 #ifdef _WIN64
1055     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1056       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1057     };
1058     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1059       c_farg0, c_farg1, c_farg2, c_farg3
1060     };
1061 #else
1062     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1063       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1064     };
1065     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1066       c_farg0, c_farg1, c_farg2, c_farg3,
1067       c_farg4, c_farg5, c_farg6, c_farg7
1068     };
1069 #endif // _WIN64
1070 
1071 
1072     uint int_args = 0;
1073     uint fp_args = 0;
1074     uint stk_args = 0; // inc by 2 each time
1075 
1076     for (int i = 0; i < total_args_passed; i++) {
1077       switch (sig_bt[i]) {
1078       case T_BOOLEAN:
1079       case T_CHAR:
1080       case T_BYTE:
1081       case T_SHORT:
1082       case T_INT:
1083         if (int_args < Argument::n_int_register_parameters_c) {
1084           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1085 #ifdef _WIN64
1086           fp_args++;
1087           // Allocate slots for the callee to stuff register args on the stack.
1088           stk_args += 2;
1089 #endif
1090         } else {
1091           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1092           stk_args += 2;
1093         }
1094         break;
1095       case T_LONG:
1096         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1097         // fall through
1098       case T_OBJECT:
1099       case T_ARRAY:
1100       case T_ADDRESS:
1101       case T_METADATA:
1102         if (int_args < Argument::n_int_register_parameters_c) {
1103           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1104 #ifdef _WIN64
1105           fp_args++;
1106           stk_args += 2;
1107 #endif
1108         } else {
1109           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1110           stk_args += 2;
1111         }
1112         break;
1113       case T_FLOAT:
1114         if (fp_args < Argument::n_float_register_parameters_c) {
1115           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1116 #ifdef _WIN64
1117           int_args++;
1118           // Allocate slots for the callee to stuff register args on the stack.
1119           stk_args += 2;
1120 #endif
1121         } else {
1122           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1123           stk_args += 2;
1124         }
1125         break;
1126       case T_DOUBLE:
1127         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1128         if (fp_args < Argument::n_float_register_parameters_c) {
1129           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1130 #ifdef _WIN64
1131           int_args++;
1132           // Allocate slots for the callee to stuff register args on the stack.
1133           stk_args += 2;
1134 #endif
1135         } else {
1136           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1137           stk_args += 2;
1138         }
1139         break;
1140       case T_VOID: // Halves of longs and doubles
1141         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1142         regs[i].set_bad();
1143         break;
1144       default:
1145         ShouldNotReachHere();
1146         break;
1147       }
1148     }
1149 #ifdef _WIN64
1150   // The Windows ABI requires that we always allocate enough stack space
1151   // for 4 64-bit registers to be stored down.
1152   if (stk_args < 8) {
1153     stk_args = 8;
1154   }
1155 #endif // _WIN64
1156 
1157   return stk_args;
1158 }
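
     // For example, for a native signature (jint, jdouble, jobject) the loop
     // above yields roughly:
     //
     //   non-Windows: jint -> c_rarg0 (rdi), jdouble -> c_farg0 (xmm0),
     //                jobject -> c_rarg1 (rsi); returns 0 stack slots.
     //
     //   Win64:       jint -> c_rarg0 (rcx), jdouble -> xmm1 (integer and FP
     //                positions advance together), jobject -> r8; stk_args is
     //                bumped to the 8-slot (32-byte) register home area.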
1159 
1160 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1161                                              uint num_bits,
1162                                              uint total_args_passed) {
1163   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1164          "only certain vector sizes are supported for now");
1165 
1166   static const XMMRegister VEC_ArgReg[32] = {
1167      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1168      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1169     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1170     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1171   };
1172 
1173   uint stk_args = 0;
1174   uint fp_args = 0;
1175 
1176   for (uint i = 0; i < total_args_passed; i++) {
1177     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1178     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1179     regs[i].set_pair(vmreg->next(next_val), vmreg);
1180   }
1181 
1182   return stk_args;
1183 }
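
     // For example, with num_bits == 256 every argument i gets the YMM view of
     // xmm(i): regs[i] is the pair (xmm(i)->as_VMReg(), xmm(i)->as_VMReg()->next(7)),
     // i.e. eight consecutive 32-bit VMReg slots. No stack slots are used, so
     // the routine returns 0.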
1184 
1185 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1186   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1187   // which by this time is free to use
1188   switch (ret_type) {
1189   case T_FLOAT:
1190     __ movflt(Address(rbp, -wordSize), xmm0);
1191     break;
1192   case T_DOUBLE:
1193     __ movdbl(Address(rbp, -wordSize), xmm0);
1194     break;
1195   case T_VOID:  break;
1196   default: {
1197     __ movptr(Address(rbp, -wordSize), rax);
1198     }
1199   }
1200 }
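
     // For example, a native call returning jdouble leaves its result in xmm0;
     // save_native_result(masm, T_DOUBLE, ...) spills it to [rbp - wordSize] so
     // that intervening code which may clobber xmm0 can run, and
     // restore_native_result(masm, T_DOUBLE, ...) reloads it from the same slot.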
1201 
1202 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1203   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1204   // which by this time is free to use
1205   switch (ret_type) {
1206   case T_FLOAT:
1207     __ movflt(xmm0, Address(rbp, -wordSize));
1208     break;
1209   case T_DOUBLE:
1210     __ movdbl(xmm0, Address(rbp, -wordSize));
1211     break;
1212   case T_VOID:  break;
1213   default: {
1214     __ movptr(rax, Address(rbp, -wordSize));
1215     }
1216   }
1217 }
1218 
1219 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1220     for ( int i = first_arg ; i < arg_count ; i++ ) {
1221       if (args[i].first()->is_Register()) {
1222         __ push(args[i].first()->as_Register());
1223       } else if (args[i].first()->is_XMMRegister()) {
1224         __ subptr(rsp, 2*wordSize);
1225         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1226       }
1227     }
1228 }
1229 
1230 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1231     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1232       if (args[i].first()->is_Register()) {
1233         __ pop(args[i].first()->as_Register());
1234       } else if (args[i].first()->is_XMMRegister()) {
1235         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1236         __ addptr(rsp, 2*wordSize);
1237       }
1238     }
1239 }
1240 
1241 // Different signatures may require very different orders for the move
1242 // to avoid clobbering other arguments.  There's no simple way to
1243 // order them safely.  Compute a safe order for issuing stores and
1244 // break any cycles in those stores.  This code is fairly general but
1245 // it's not necessary on the other platforms so we keep it in the
1246 // platform dependent code instead of moving it into a shared file.
1247 // (See bugs 7013347 & 7145024.)
1248 // Note that this code is specific to LP64.
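     //
     // For example, if the signature requires the two register moves
     //   rdi -> rsi   and   rsi -> rdi
     // neither store can be issued first without clobbering the other's source.
     // The two moves end up linked in a cycle, and break_cycle() rewrites the
     // sequence using the caller-supplied temp, producing something like
     //   rsi -> tmp,  rdi -> rsi,  tmp -> rdi
     // which can then be emitted in order via arg_order.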
1249 class ComputeMoveOrder: public StackObj {
1250   class MoveOperation: public ResourceObj {
1251     friend class ComputeMoveOrder;
1252    private:
1253     VMRegPair        _src;
1254     VMRegPair        _dst;
1255     int              _src_index;
1256     int              _dst_index;
1257     bool             _processed;
1258     MoveOperation*  _next;
1259     MoveOperation*  _prev;
1260 
1261     static int get_id(VMRegPair r) {
1262       return r.first()->value();
1263     }
1264 
1265    public:
1266     MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1267       _src(src)
1268     , _dst(dst)
1269     , _src_index(src_index)
1270     , _dst_index(dst_index)
1271     , _processed(false)
1272     , _next(NULL)
1273     , _prev(NULL) {
1274     }
1275 
1276     VMRegPair src() const              { return _src; }
1277     int src_id() const                 { return get_id(src()); }
1278     int src_index() const              { return _src_index; }
1279     VMRegPair dst() const              { return _dst; }
1280     void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
1281     int dst_index() const              { return _dst_index; }
1282     int dst_id() const                 { return get_id(dst()); }
1283     MoveOperation* next() const       { return _next; }
1284     MoveOperation* prev() const       { return _prev; }
1285     void set_processed()               { _processed = true; }
1286     bool is_processed() const          { return _processed; }
1287 
1288     // insert
1289     void break_cycle(VMRegPair temp_register) {
1290       // create a new store following the last store
1291       // to move from the temp_register to the original
1292       MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1293 
1294       // break the cycle of links and insert new_store at the end
1295       // break the reverse link.
1296       MoveOperation* p = prev();
1297       assert(p->next() == this, "must be");
1298       _prev = NULL;
1299       p->_next = new_store;
1300       new_store->_prev = p;
1301 
1302       // change the original store to save its value in the temp.
1303       set_dst(-1, temp_register);
1304     }
1305 
1306     void link(GrowableArray<MoveOperation*>& killer) {
      // link this store in front of the store that it depends on
1308       MoveOperation* n = killer.at_grow(src_id(), NULL);
1309       if (n != NULL) {
1310         assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1311         _next = n;
1312         n->_prev = this;
1313       }
1314     }
1315   };
1316 
1317  private:
1318   GrowableArray<MoveOperation*> edges;
1319 
1320  public:
1321   ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1322                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1323     // Move operations where the dest is the stack can all be
1324     // scheduled first since they can't interfere with the other moves.
1325     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1326       if (in_sig_bt[i] == T_ARRAY) {
1327         c_arg--;
1328         if (out_regs[c_arg].first()->is_stack() &&
1329             out_regs[c_arg + 1].first()->is_stack()) {
1330           arg_order.push(i);
1331           arg_order.push(c_arg);
1332         } else {
1333           if (out_regs[c_arg].first()->is_stack() ||
1334               in_regs[i].first() == out_regs[c_arg].first()) {
1335             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1336           } else {
1337             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1338           }
1339         }
1340       } else if (in_sig_bt[i] == T_VOID) {
1341         arg_order.push(i);
1342         arg_order.push(c_arg);
1343       } else {
1344         if (out_regs[c_arg].first()->is_stack() ||
1345             in_regs[i].first() == out_regs[c_arg].first()) {
1346           arg_order.push(i);
1347           arg_order.push(c_arg);
1348         } else {
1349           add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1350         }
1351       }
1352     }
    // Break any cycles in the register moves and emit the stores in the
    // proper order.
1355     GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1356     for (int i = 0; i < stores->length(); i++) {
1357       arg_order.push(stores->at(i)->src_index());
1358       arg_order.push(stores->at(i)->dst_index());
1359     }
1360  }
1361 
  // Collect all the move operations
1363   void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1364     if (src.first() == dst.first()) return;
1365     edges.append(new MoveOperation(src_index, src, dst_index, dst));
1366   }
1367 
1368   // Walk the edges breaking cycles between moves.  The result list
1369   // can be walked in order to produce the proper set of loads
1370   GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1371     // Record which moves kill which values
1372     GrowableArray<MoveOperation*> killer;
1373     for (int i = 0; i < edges.length(); i++) {
1374       MoveOperation* s = edges.at(i);
1375       assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1376       killer.at_put_grow(s->dst_id(), s, NULL);
1377     }
1378     assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1379            "make sure temp isn't in the registers that are killed");
1380 
1381     // create links between loads and stores
1382     for (int i = 0; i < edges.length(); i++) {
1383       edges.at(i)->link(killer);
1384     }
1385 
1386     // at this point, all the move operations are chained together
1387     // in a doubly linked list.  Processing it backwards finds
1388     // the beginning of the chain, forwards finds the end.  If there's
1389     // a cycle it can be broken at any point,  so pick an edge and walk
1390     // backward until the list ends or we end where we started.
1391     GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1392     for (int e = 0; e < edges.length(); e++) {
1393       MoveOperation* s = edges.at(e);
1394       if (!s->is_processed()) {
1395         MoveOperation* start = s;
1396         // search for the beginning of the chain or cycle
1397         while (start->prev() != NULL && start->prev() != s) {
1398           start = start->prev();
1399         }
1400         if (start->prev() == s) {
1401           start->break_cycle(temp_register);
1402         }
1403         // walk the chain forward inserting to store list
1404         while (start != NULL) {
1405           stores->append(start);
1406           start->set_processed();
1407           start = start->next();
1408         }
1409       }
1410     }
1411     return stores;
1412   }
1413 };
1414 
1415 static void verify_oop_args(MacroAssembler* masm,
1416                             const methodHandle& method,
1417                             const BasicType* sig_bt,
1418                             const VMRegPair* regs) {
1419   Register temp_reg = rbx;  // not part of any compiled calling seq
1420   if (VerifyOops) {
1421     for (int i = 0; i < method->size_of_parameters(); i++) {
1422       if (is_reference_type(sig_bt[i])) {
1423         VMReg r = regs[i].first();
1424         assert(r->is_valid(), "bad oop arg");
1425         if (r->is_stack()) {
1426           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1427           __ verify_oop(temp_reg);
1428         } else {
1429           __ verify_oop(r->as_Register());
1430         }
1431       }
1432     }
1433   }
1434 }
1435 
1436 // defined in stubGenerator_x86_64.cpp
1437 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots);
1438 void fill_continuation_entry(MacroAssembler* masm);
1439 void continuation_enter_cleanup(MacroAssembler* masm);
1440 
1441 // enterSpecial(Continuation c, boolean isContinue)
1442 // On entry: c_rarg1 -- the continuation object
1443 //           c_rarg2 -- isContinue
static void gen_continuation_enter(MacroAssembler* masm,
                                   const methodHandle& method,
                                   const BasicType* sig_bt,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots) {
1452   //verify_oop_args(masm, method, sig_bt, regs);
1453   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1454                          relocInfo::static_call_type);
1455 
1456   stack_slots = 2; // will be overwritten
1457   address start = __ pc();
1458 
1459   Label call_thaw, exit;
1460 
1461   __ push(rbp);
1462 
1463   //BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1464   //bs->nmethod_entry_barrier(masm);
1465   OopMap* map = continuation_enter_setup(masm, stack_slots);  // kills rax
1466 
1467   // Frame is now completed as far as size and linkage.
  frame_complete = __ pc() - start;
  // if isContinue == 0
  //   _enterSP = sp
  // end

  fill_continuation_entry(masm); // kills rax
1474 
1475   __ cmpl(c_rarg2, 0);
1476   __ jcc(Assembler::notEqual, call_thaw);
1477 
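  // Align the 4-byte displacement of the call below (it begins one byte after
  // the call opcode, hence pc() + 1) so that the call site can later be
  // patched atomically when the static call is resolved.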
1478   int up = align_up((intptr_t) __ pc() + 1, 4) - (intptr_t) (__ pc() + 1);
1479   if (up > 0) {
1480     __ nop(up);
1481   }
1482 
1483   address mark = __ pc();
1484   __ call(resolve);
1485   oop_maps->add_gc_map(__ pc() - start, map);
1486   __ post_call_nop();
1487 
1488   __ jmp(exit);
1489 
1490   __ bind(call_thaw);
1491 
1492   __ movptr(rbx, (intptr_t) StubRoutines::cont_thaw());
1493   __ call(rbx);
1494   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1495   ContinuationEntry::return_pc_offset = __ pc() - start;
1496   __ post_call_nop();
1497 
1498   __ bind(exit);
1499   continuation_enter_cleanup(masm);
1500   __ pop(rbp);
1501   __ ret(0);
1502 
1503   /// exception handling
1504 
1505   exception_offset = __ pc() - start;
1506 
1507   continuation_enter_cleanup(masm);
1508   __ addptr(rsp, 1*wordSize);
1509 
1510   __ movptr(rbx, rax); // save the exception
1511   __ movptr(c_rarg0, Address(rsp, 0));
1512 
1513   __ call_VM_leaf(CAST_FROM_FN_PTR(address,
1514         SharedRuntime::exception_handler_for_return_address),
1515       r15_thread, c_rarg0);
1516   __ mov(rdi, rax);
1517   __ movptr(rax, rbx);
1518   __ mov(rbx, rdi);
1519   __ pop(rdx);
1520 
1521   // continue at exception handler (return address removed)
1522   // rax: exception
1523   // rbx: exception handler
1524   // rdx: throwing pc
1525   __ verify_oop(rax);
1526   __ jmp(rbx);
1527 
1528   CodeBuffer* cbuf = masm->code_section()->outer();
1529   address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, mark);
1530 }
1531 
1532 static void gen_special_dispatch(MacroAssembler* masm,
1533                                  const methodHandle& method,
1534                                  const BasicType* sig_bt,
1535                                  const VMRegPair* regs) {
1536   verify_oop_args(masm, method, sig_bt, regs);
1537   vmIntrinsics::ID iid = method->intrinsic_id();
1538 
1539   // Now write the args into the outgoing interpreter space
1540   bool     has_receiver   = false;
1541   Register receiver_reg   = noreg;
1542   int      member_arg_pos = -1;
1543   Register member_reg     = noreg;
1544   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1545   if (ref_kind != 0) {
1546     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1547     member_reg = rbx;  // known to be free at this point
1548     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1549   } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1550     has_receiver = true;
1551   } else {
1552     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1553   }
1554 
1555   if (member_reg != noreg) {
1556     // Load the member_arg into register, if necessary.
1557     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1558     VMReg r = regs[member_arg_pos].first();
1559     if (r->is_stack()) {
1560       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1561     } else {
1562       // no data motion is needed
1563       member_reg = r->as_Register();
1564     }
1565   }
1566 
1567   if (has_receiver) {
1568     // Make sure the receiver is loaded into a register.
1569     assert(method->size_of_parameters() > 0, "oob");
1570     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1571     VMReg r = regs[0].first();
1572     assert(r->is_valid(), "bad receiver arg");
1573     if (r->is_stack()) {
1574       // Porting note:  This assumes that compiled calling conventions always
1575       // pass the receiver oop in a register.  If this is not true on some
1576       // platform, pick a temp and load the receiver from stack.
1577       fatal("receiver always in a register");
1578       receiver_reg = j_rarg0;  // known to be free at this point
1579       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1580     } else {
1581       // no data motion is needed
1582       receiver_reg = r->as_Register();
1583     }
1584   }
1585 
1586   // Figure out which address we are really jumping to:
1587   MethodHandles::generate_method_handle_dispatch(masm, iid,
1588                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1589 }
1590 
1591 // ---------------------------------------------------------------------------
1592 // Generate a native wrapper for a given method.  The method takes arguments
1593 // in the Java compiled code convention, marshals them to the native
1594 // convention (handlizes oops, etc), transitions to native, makes the call,
1595 // returns to java state (possibly blocking), unhandlizes any result and
1596 // returns.
1597 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1606 //
1607 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1608                                                 const methodHandle& method,
1609                                                 int compile_id,
1610                                                 BasicType* in_sig_bt,
1611                                                 VMRegPair* in_regs,
1612                                                 BasicType ret_type) {
1613   if (method->is_continuation_enter_intrinsic()) {
1614     vmIntrinsics::ID iid = method->intrinsic_id();
1615     intptr_t start = (intptr_t)__ pc();
1616     int vep_offset = ((intptr_t)__ pc()) - start;
1617     int exception_offset = 0;
1618     int frame_complete = 0;
1619     int stack_slots = 0;
1620     OopMapSet* oop_maps =  new OopMapSet();
1621     gen_continuation_enter(masm,
1622                          method,
1623                          in_sig_bt,
1624                          in_regs,
1625                          exception_offset,
1626                          oop_maps,
1627                          frame_complete,
1628                          stack_slots);
1629     __ flush();
1630     nmethod* nm = nmethod::new_native_nmethod(method,
1631                                               compile_id,
1632                                               masm->code(),
1633                                               vep_offset,
1634                                               frame_complete,
1635                                               stack_slots,
1636                                               in_ByteSize(-1),
1637                                               in_ByteSize(-1),
1638                                               oop_maps,
1639                                               exception_offset);
1640     ContinuationEntry::set_enter_nmethod(nm);
1641     return nm;
1642   }
1643 
1644   if (method->is_method_handle_intrinsic()) {
1645     vmIntrinsics::ID iid = method->intrinsic_id();
1646     intptr_t start = (intptr_t)__ pc();
1647     int vep_offset = ((intptr_t)__ pc()) - start;
1648     gen_special_dispatch(masm,
1649                          method,
1650                          in_sig_bt,
1651                          in_regs);
1652     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1653     __ flush();
1654     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1655     return nmethod::new_native_nmethod(method,
1656                                        compile_id,
1657                                        masm->code(),
1658                                        vep_offset,
1659                                        frame_complete,
1660                                        stack_slots / VMRegImpl::slots_per_word,
1661                                        in_ByteSize(-1),
1662                                        in_ByteSize(-1),
1663                                        (OopMapSet*)NULL);
1664   }
1665   address native_func = method->native_function();
1666   assert(native_func != NULL, "must have function");
1667 
1668   // An OopMap for lock (and class if static)
1669   OopMapSet *oop_maps = new OopMapSet();
1670   intptr_t start = (intptr_t)__ pc();
1671 
  // We have received a description of where all the java args are located
1673   // on entry to the wrapper. We need to convert these args to where
1674   // the jni function will expect them. To figure out where they go
1675   // we convert the java signature to a C signature by inserting
1676   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1677 
1678   const int total_in_args = method->size_of_parameters();
1679   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1680 
1681   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1682   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1683   BasicType* in_elem_bt = NULL;
1684 
1685   int argc = 0;
1686   out_sig_bt[argc++] = T_ADDRESS;
1687   if (method->is_static()) {
1688     out_sig_bt[argc++] = T_OBJECT;
1689   }
1690 
1691   for (int i = 0; i < total_in_args ; i++ ) {
1692     out_sig_bt[argc++] = in_sig_bt[i];
1693   }
1694 
1695   // Now figure out where the args must be stored and how much stack space
1696   // they require.
1697   int out_arg_slots;
1698   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1699 
1700   // Compute framesize for the wrapper.  We need to handlize all oops in
1701   // incoming registers
1702 
1703   // Calculate the total number of stack slots we will need.
1704 
1705   // First count the abi requirement plus all of the outgoing args
1706   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1707 
1708   // Now the space for the inbound oop handle area
1709   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1710 
1711   int oop_handle_offset = stack_slots;
1712   stack_slots += total_save_slots;
1713 
1714   // Now any space we need for handlizing a klass if static method
1715 
1716   int klass_slot_offset = 0;
1717   int klass_offset = -1;
1718   int lock_slot_offset = 0;
1719   bool is_static = false;
1720 
1721   if (method->is_static()) {
1722     klass_slot_offset = stack_slots;
1723     stack_slots += VMRegImpl::slots_per_word;
1724     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1725     is_static = true;
1726   }
1727 
1728   // Plus a lock if needed
1729 
1730   if (method->is_synchronized()) {
1731     lock_slot_offset = stack_slots;
1732     stack_slots += VMRegImpl::slots_per_word;
1733   }
1734 
1735   // Now a place (+2) to save return values or temp during shuffling
1736   // + 4 for return address (which we own) and saved rbp
1737   stack_slots += 6;
1738 
1739   // Ok The space we have allocated will look like:
1740   //
1741   //
1742   // FP-> |                     |
1743   //      |---------------------|
1744   //      | 2 slots for moves   |
1745   //      |---------------------|
1746   //      | lock box (if sync)  |
1747   //      |---------------------| <- lock_slot_offset
1748   //      | klass (if static)   |
1749   //      |---------------------| <- klass_slot_offset
1750   //      | oopHandle area      |
1751   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1752   //      | outbound memory     |
1753   //      | based arguments     |
1754   //      |                     |
1755   //      |---------------------|
1756   //      |                     |
1757   // SP-> | out_preserved_slots |
1758   //
1759   //
1760 
1761 
1762   // Now compute actual number of stack words we need rounding to make
1763   // stack properly aligned.
1764   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1765 
1766   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
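  // For illustration only: a non-static, non-synchronized method with
  // out_arg_slots == 8 would need 0 + 8 + 12 (oop handle area) + 6 = 26
  // slots, aligned up to 28 slots == 112 bytes (StackAlignmentInSlots is 4
  // given 16-byte stack alignment and 4-byte stack slots).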
1767 
1768   // First thing make an ic check to see if we should even be here
1769 
1770   // We are free to use all registers as temps without saving them and
1771   // restoring them except rbp. rbp is the only callee save register
1772   // as far as the interpreter and the compiler(s) are concerned.
1773 
1774 
1775   const Register ic_reg = rax;
1776   const Register receiver = j_rarg0;
1777 
1778   Label hit;
1779   Label exception_pending;
1780 
1781   assert_different_registers(ic_reg, receiver, rscratch1);
1782   __ verify_oop(receiver);
1783   __ load_klass(rscratch1, receiver, rscratch2);
1784   __ cmpq(ic_reg, rscratch1);
1785   __ jcc(Assembler::equal, hit);
1786 
1787   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1788 
1789   // Verified entry point must be aligned
1790   __ align(8);
1791 
1792   __ bind(hit);
1793 
1794   int vep_offset = ((intptr_t)__ pc()) - start;
1795 
1796   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1797     Label L_skip_barrier;
1798     Register klass = r10;
1799     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1800     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1801 
1802     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1803 
1804     __ bind(L_skip_barrier);
1805   }
1806 
1807 #ifdef COMPILER1
1808   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1809   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1810     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1811   }
1812 #endif // COMPILER1
1813 
1814   // The instruction at the verified entry point must be 5 bytes or longer
1815   // because it can be patched on the fly by make_non_entrant. The stack bang
1816   // instruction fits that requirement.
1817 
1818   // Generate stack overflow check
1819   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1820 
1821   // Generate a new frame for the wrapper.
1822   __ enter();
1823   // -2 because return address is already present and so is saved rbp
1824   __ subptr(rsp, stack_size - 2*wordSize);
1825 
1826   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1827   bs->nmethod_entry_barrier(masm);
1828 
1829   // Frame is now completed as far as size and linkage.
1830   int frame_complete = ((intptr_t)__ pc()) - start;
1831 
  if (UseRTMLocking) {
    // Abort RTM transaction before calling JNI
    // because critical section will be large and will be
    // aborted anyway. Also nmethod could be deoptimized.
    __ xabort(0);
  }

#ifdef ASSERT
  {
    Label L;
    __ mov(rax, rsp);
    __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
    __ cmpptr(rax, rsp);
    __ jcc(Assembler::equal, L);
    __ stop("improperly aligned stack");
    __ bind(L);
  }
#endif /* ASSERT */
1850 
1851 
1852   // We use r14 as the oop handle for the receiver/klass
1853   // It is callee save so it survives the call to native
1854 
1855   const Register oop_handle_reg = r14;
1856 
1857   //
1858   // We immediately shuffle the arguments so that any vm call we have to
1859   // make from here on out (sync slow path, jvmti, etc.) we will have
1860   // captured the oops from our caller and have a valid oopMap for
1861   // them.
1862 
1863   // -----------------
1864   // The Grand Shuffle
1865 
1866   // The Java calling convention is either equal (linux) or denser (win64) than the
  // c calling convention. However, because of the jni_env argument the c calling
1868   // convention always has at least one more (and two for static) arguments than Java.
1869   // Therefore if we move the args from java -> c backwards then we will never have
1870   // a register->register conflict and we don't have to build a dependency graph
1871   // and figure out how to break any cycles.
1872   //
1873 
1874   // Record esp-based slot for receiver on stack for non-static methods
1875   int receiver_offset = -1;
1876 
1877   // This is a trick. We double the stack slots so we can claim
1878   // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
1880   // sure we can capture all the incoming oop args from the
1881   // caller.
1882   //
1883   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1884 
1885   // Mark location of rbp (someday)
1886   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1887 
1888   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1889   // All inbound args are referenced based on rbp and all outbound args via rsp.
1890 
1891 
1892 #ifdef ASSERT
1893   bool reg_destroyed[RegisterImpl::number_of_registers];
1894   bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1895   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1896     reg_destroyed[r] = false;
1897   }
1898   for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1899     freg_destroyed[f] = false;
1900   }
1901 
1902 #endif /* ASSERT */
1903 
1904   // For JNI natives the incoming and outgoing registers are offset upwards.
1905   GrowableArray<int> arg_order(2 * total_in_args);
1906 
1907   VMRegPair tmp_vmreg;
1908   tmp_vmreg.set2(rbx->as_VMReg());
1909 
1910   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1911     arg_order.push(i);
1912     arg_order.push(c_arg);
1913   }
1914 
1915   int temploc = -1;
1916   for (int ai = 0; ai < arg_order.length(); ai += 2) {
1917     int i = arg_order.at(ai);
1918     int c_arg = arg_order.at(ai + 1);
1919     __ block_comment(err_msg("move %d -> %d", i, c_arg));
1920 #ifdef ASSERT
1921     if (in_regs[i].first()->is_Register()) {
1922       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1923     } else if (in_regs[i].first()->is_XMMRegister()) {
1924       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1925     }
1926     if (out_regs[c_arg].first()->is_Register()) {
1927       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1928     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1929       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1930     }
1931 #endif /* ASSERT */
1932     switch (in_sig_bt[i]) {
1933       case T_ARRAY:
1934       case T_OBJECT:
1935         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1936                     ((i == 0) && (!is_static)),
1937                     &receiver_offset);
1938         break;
1939       case T_VOID:
1940         break;
1941 
1942       case T_FLOAT:
1943         __ float_move(in_regs[i], out_regs[c_arg]);
        break;
1945 
1946       case T_DOUBLE:
1947         assert( i + 1 < total_in_args &&
1948                 in_sig_bt[i + 1] == T_VOID &&
1949                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1950         __ double_move(in_regs[i], out_regs[c_arg]);
1951         break;
1952 
1953       case T_LONG :
1954         __ long_move(in_regs[i], out_regs[c_arg]);
1955         break;
1956 
1957       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1958 
1959       default:
1960         __ move32_64(in_regs[i], out_regs[c_arg]);
1961     }
1962   }
1963 
1964   int c_arg;
1965 
1966   // Pre-load a static method's oop into r14.  Used both by locking code and
1967   // the normal JNI call code.
1968   // point c_arg at the first arg that is already loaded in case we
1969   // need to spill before we call out
1970   c_arg = total_c_args - total_in_args;
1971 
1972   if (method->is_static()) {
1973 
1974     //  load oop into a register
1975     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1976 
    // Now handlize the static class mirror; it's known not-null.
1978     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1979     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1980 
1981     // Now get the handle
1982     __ lea(oop_handle_reg, Address(rsp, klass_offset));
1983     // store the klass handle as second argument
1984     __ movptr(c_rarg1, oop_handle_reg);
1985     // and protect the arg if we must spill
1986     c_arg--;
1987   }
1988 
1989   // Change state to native (we save the return address in the thread, since it might not
  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
1991   // points into the right code segment. It does not have to be the correct return pc.
1992   // We use the same pc/oopMap repeatedly when we call out
1993 
1994   intptr_t the_pc = (intptr_t) __ pc();
1995   oop_maps->add_gc_map(the_pc - start, map);
1996 
1997   __ set_last_Java_frame(rsp, noreg, (address)the_pc);
1998 
1999 
  // We have all of the arguments set up at this point. We must not touch any of the
  // outgoing argument registers from here on (if we had to save and restore them,
  // no oopMap would describe any oops they contain).
2002 
2003   {
2004     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2005     // protect the args we've loaded
2006     save_args(masm, total_c_args, c_arg, out_regs);
2007     __ mov_metadata(c_rarg1, method());
2008     __ call_VM_leaf(
2009       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2010       r15_thread, c_rarg1);
2011     restore_args(masm, total_c_args, c_arg, out_regs);
2012   }
2013 
2014   // RedefineClasses() tracing support for obsolete method entry
2015   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2016     // protect the args we've loaded
2017     save_args(masm, total_c_args, c_arg, out_regs);
2018     __ mov_metadata(c_rarg1, method());
2019     __ call_VM_leaf(
2020       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2021       r15_thread, c_rarg1);
2022     restore_args(masm, total_c_args, c_arg, out_regs);
2023   }
2024 
2025   // Lock a synchronized method
2026 
2027   // Register definitions used by locking and unlocking
2028 
2029   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2030   const Register obj_reg  = rbx;  // Will contain the oop
2031   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2032   const Register old_hdr  = r13;  // value of old header at unlock time
2033 
2034   Label slow_path_lock;
2035   Label lock_done;
2036 
2037   if (method->is_synchronized()) {
2038 
2039     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2040 
2041     // Get the handle (the 2nd argument)
2042     __ mov(oop_handle_reg, c_rarg1);
2043 
2044     // Get address of the box
2045 
2046     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2047 
2048     // Load the oop from the handle
2049     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2050 
2051     // Load immediate 1 into swap_reg %rax
2052     __ movl(swap_reg, 1);
2053 
2054     // Load (object->mark() | 1) into swap_reg %rax
2055     __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2056 
2057     // Save (object->mark() | 1) into BasicLock's displaced header
2058     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2059 
2060     // src -> dest iff dest == rax else rax <- dest
2061     __ lock();
2062     __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2063     __ jcc(Assembler::equal, lock_done);
2064 
2065     // Hmm should this move to the slow path code area???
2066 
2067     // Test if the oopMark is an obvious stack pointer, i.e.,
2068     //  1) (mark & 3) == 0, and
    //  2) rsp <= mark < rsp + os::pagesize()
2070     // These 3 tests can be done by evaluating the following
2071     // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2072     // assuming both stack pointer and pagesize have their
2073     // least significant 2 bits clear.
2074     // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
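    //
    // For illustration, with a 4K page 3 - os::vm_page_size() == -4093 ==
    // 0x...f003, so the andptr keeps only the low two bits and every bit at
    // or above the page size; the result is zero exactly when the mark is
    // 4-byte aligned and lies within one page at or above rsp, which
    // identifies a stack lock already held by this thread (recursive case).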
2075 
2076     __ subptr(swap_reg, rsp);
2077     __ andptr(swap_reg, 3 - os::vm_page_size());
2078 
    // Save the test result; for the recursive case, the result is zero
2080     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2081     __ jcc(Assembler::notEqual, slow_path_lock);
2082 
2083     // Slow path will re-enter here

2084     __ bind(lock_done);
2085     // __ inc_held_monitor_count(r15_thread);
2086   }
2087 
2088   // Finally just about ready to make the JNI call
2089 
2090   // get JNIEnv* which is first argument to native
2091   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2092 
2093   // Now set thread in native
2094   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2095 
2096   __ call(RuntimeAddress(native_func));
2097 
2098   // Verify or restore cpu control state after JNI call
2099   __ restore_cpu_control_state_after_jni();
2100 
2101   // Unpack native results.
2102   switch (ret_type) {
2103   case T_BOOLEAN: __ c2bool(rax);            break;
2104   case T_CHAR   : __ movzwl(rax, rax);      break;
2105   case T_BYTE   : __ sign_extend_byte (rax); break;
2106   case T_SHORT  : __ sign_extend_short(rax); break;
2107   case T_INT    : /* nothing to do */        break;
2108   case T_DOUBLE :
2109   case T_FLOAT  :
2110     // Result is in xmm0 we'll save as needed
2111     break;
2112   case T_ARRAY:                 // Really a handle
2113   case T_OBJECT:                // Really a handle
2114       break; // can't de-handlize until after safepoint check
2115   case T_VOID: break;
2116   case T_LONG: break;
2117   default       : ShouldNotReachHere();
2118   }
2119 
2120   Label after_transition;
2121 
2122   // Switch thread to "native transition" state before reading the synchronization state.
2123   // This additional state is necessary because reading and testing the synchronization
2124   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2125   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2126   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2127   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
2129   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2130 
2131   // Force this write out before the read below
2132   __ membar(Assembler::Membar_mask_bits(
2133               Assembler::LoadLoad | Assembler::LoadStore |
2134               Assembler::StoreLoad | Assembler::StoreStore));
2135 
2136   // check for safepoint operation in progress and/or pending suspend requests
2137   {
2138     Label Continue;
2139     Label slow_path;
2140 
2141     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2142 
2143     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2144     __ jcc(Assembler::equal, Continue);
2145     __ bind(slow_path);
2146 
    // Don't use call_VM, as it will see a possible pending exception and forward it
    // and never return here, preventing us from clearing _last_native_pc down below.
    // Also can't use call_VM_leaf, as it will check to see if rsi & rdi are
    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
    // by hand.
2152     //
2153     __ vzeroupper();
2154     save_native_result(masm, ret_type, stack_slots);
2155     __ mov(c_rarg0, r15_thread);
2156     __ mov(r12, rsp); // remember sp
2157     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2158     __ andptr(rsp, -16); // align stack as required by ABI
2159     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2160     __ mov(rsp, r12); // restore sp
2161     __ reinit_heapbase();
2162     // Restore any method result value
2163     restore_native_result(masm, ret_type, stack_slots);
2164     __ bind(Continue);
2165   }
2166 
2167   // change thread state
2168   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2169   __ bind(after_transition);
2170 
2171   Label reguard;
2172   Label reguard_done;
2173   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2174   __ jcc(Assembler::equal, reguard);
2175   __ bind(reguard_done);
2176 
2177   // native result if any is live
2178 
2179   // Unlock
2180   Label unlock_done;
2181   Label slow_path_unlock;
2182   if (method->is_synchronized()) {
2183 
2184     // Get locked oop from the handle we passed to jni
2185     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2186 
2187     Label done;
2188     // Simple recursive lock?
2189 
2190     __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2191     __ jcc(Assembler::equal, done);
2192 
    // Must save rax if it is live now because cmpxchg must use it
2194     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2195       save_native_result(masm, ret_type, stack_slots);
2196     }
2197 
2198 
2199     // get address of the stack lock
2200     __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2201     //  get old displaced header
2202     __ movptr(old_hdr, Address(rax, 0));
2203 
2204     // Atomic swap old header if oop still contains the stack lock
2205     __ lock();
2206     __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2207     __ jcc(Assembler::notEqual, slow_path_unlock);
2208 
2209     // slow path re-enters here
2210     __ bind(unlock_done);
2211     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2212       restore_native_result(masm, ret_type, stack_slots);
2213     }
2214 
2215     __ bind(done);
2216     // __ dec_held_monitor_count(r15_thread);
2217   }
2218   {
2219     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2220     save_native_result(masm, ret_type, stack_slots);
2221     __ mov_metadata(c_rarg1, method());
2222     __ call_VM_leaf(
2223          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2224          r15_thread, c_rarg1);
2225     restore_native_result(masm, ret_type, stack_slots);
2226   }
2227 
2228   __ reset_last_Java_frame(false);
2229 
2230   // Unbox oop result, e.g. JNIHandles::resolve value.
2231   if (is_reference_type(ret_type)) {
2232     __ resolve_jobject(rax /* value */,
2233                        r15_thread /* thread */,
2234                        rcx /* tmp */);
2235   }
2236 
2237   if (CheckJNICalls) {
2238     // clear_pending_jni_exception_check
2239     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2240   }
2241 
2242   // reset handle block
2243   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2244   __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2245 
2246   // pop our frame
2247 
2248   __ leave();
2249 
2250   // Any exception pending?
2251   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2252   __ jcc(Assembler::notEqual, exception_pending);
2253 
2254   // Return
2255 
2256   __ ret(0);
2257 
2258   // Unexpected paths are out of line and go here
2259 
2260   // forward the exception
2261   __ bind(exception_pending);
2262 
2263   // and forward the exception
2264   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2265 
2266   // Slow path locking & unlocking
2267   if (method->is_synchronized()) {
2268 
2269     // BEGIN Slow path lock
2270     __ bind(slow_path_lock);
2271 
    // We have last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM.
2273     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2274 
2275     // protect the args we've loaded
2276     save_args(masm, total_c_args, c_arg, out_regs);
2277 
2278     __ mov(c_rarg0, obj_reg);
2279     __ mov(c_rarg1, lock_reg);
2280     __ mov(c_rarg2, r15_thread);
2281 
2282     // Not a leaf but we have last_Java_frame setup as we want
2283     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2284     restore_args(masm, total_c_args, c_arg, out_regs);
2285 
2286 #ifdef ASSERT
    {
      Label L;
      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("no pending exception allowed on exit from monitorenter");
      __ bind(L);
    }
2293 #endif
2294     __ jmp(lock_done);
2295 
2296     // END Slow path lock
2297 
2298     // BEGIN Slow path unlock
2299     __ bind(slow_path_unlock);
2300 
2301     // If we haven't already saved the native result we must save it now as xmm registers
2302     // are still exposed.
2303     __ vzeroupper();
2304     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2305       save_native_result(masm, ret_type, stack_slots);
2306     }
2307 
2308     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2309 
2310     __ mov(c_rarg0, obj_reg);
2311     __ mov(c_rarg2, r15_thread);
2312     __ mov(r12, rsp); // remember sp
2313     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2314     __ andptr(rsp, -16); // align stack as required by ABI
2315 
2316     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2317     // NOTE that obj_reg == rbx currently
2318     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2319     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2320 
2321     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2322     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2323     __ mov(rsp, r12); // restore sp
2324     __ reinit_heapbase();
2325 #ifdef ASSERT
2326     {
2327       Label L;
2328       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2329       __ jcc(Assembler::equal, L);
2330       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2331       __ bind(L);
2332     }
2333 #endif /* ASSERT */
2334 
2335     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2336 
2337     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2338       restore_native_result(masm, ret_type, stack_slots);
2339     }
2340     __ jmp(unlock_done);
2341 
2342     // END Slow path unlock
2343 
2344   } // synchronized
2345 
2346   // SLOW PATH Reguard the stack if needed
2347 
2348   __ bind(reguard);
2349   __ vzeroupper();
2350   save_native_result(masm, ret_type, stack_slots);
2351   __ mov(r12, rsp); // remember sp
2352   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2353   __ andptr(rsp, -16); // align stack as required by ABI
2354   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2355   __ mov(rsp, r12); // restore sp
2356   __ reinit_heapbase();
2357   restore_native_result(masm, ret_type, stack_slots);
2358   // and continue
2359   __ jmp(reguard_done);
2360 
2361 
2362 
2363   __ flush();
2364 
2365   nmethod *nm = nmethod::new_native_nmethod(method,
2366                                             compile_id,
2367                                             masm->code(),
2368                                             vep_offset,
2369                                             frame_complete,
2370                                             stack_slots / VMRegImpl::slots_per_word,
2371                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2372                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2373                                             oop_maps);
2374 
2375   return nm;
2376 }
2377 
2378 // this function returns the adjust size (in number of words) to a c2i adapter
2379 // activation for use during deoptimization
2380 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2381   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2382 }
2383 
2384 
2385 uint SharedRuntime::out_preserve_stack_slots() {
2386   return 0;
2387 }
2388 
2389 
2390 // Number of stack slots between incoming argument block and the start of
2391 // a new frame.  The PROLOG must add this many slots to the stack.  The
2392 // EPILOG must remove this many slots.  amd64 needs two slots for
2393 // return address.
2394 uint SharedRuntime::in_preserve_stack_slots() {
2395   return 4 + 2 * VerifyStackAtCalls;
2396 }
2397 
2398 //------------------------------generate_deopt_blob----------------------------
2399 void SharedRuntime::generate_deopt_blob() {
2400   // Allocate space for the code
2401   ResourceMark rm;
2402   // Setup code generation tools
2403   int pad = 0;
2404   if (UseAVX > 2) {
2405     pad += 1024;
2406   }
2407 #if INCLUDE_JVMCI
2408   if (EnableJVMCI) {
2409     pad += 512; // Increase the buffer size when compiling for JVMCI
2410   }
2411 #endif
2412   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2413   MacroAssembler* masm = new MacroAssembler(&buffer);
2414   int frame_size_in_words;
2415   OopMap* map = NULL;
2416   OopMapSet *oop_maps = new OopMapSet();
2417 
2418   // -------------
2419   // This code enters when returning to a de-optimized nmethod.  A return
  // address has been pushed on the stack, and return values are in
2421   // registers.
2422   // If we are doing a normal deopt then we were called from the patched
2423   // nmethod from the point we returned to the nmethod. So the return
2424   // address on the stack is wrong by NativeCall::instruction_size
2425   // We will adjust the value so it looks like we have the original return
2426   // address on the stack (like when we eagerly deoptimized).
2427   // In the case of an exception pending when deoptimizing, we enter
2428   // with a return address on the stack that points after the call we patched
2429   // into the exception handler. We have the following register state from,
2430   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2431   //    rax: exception oop
2432   //    rbx: exception handler
2433   //    rdx: throwing pc
2434   // So in this case we simply jam rdx into the useless return address and
2435   // the stack looks just like we want.
2436   //
2437   // At this point we need to de-opt.  We save the argument return
2438   // registers.  We call the first C routine, fetch_unroll_info().  This
2439   // routine captures the return values and returns a structure which
2440   // describes the current frame size and the sizes of all replacement frames.
2441   // The current frame is compiled code and may contain many inlined
2442   // functions, each with their own JVM state.  We pop the current frame, then
2443   // push all the new frames.  Then we call the C routine unpack_frames() to
2444   // populate these frames.  Finally unpack_frames() returns us the new target
2445   // address.  Notice that callee-save registers are BLOWN here; they have
2446   // already been captured in the vframeArray at the time the return PC was
2447   // patched.
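  //
  // In outline the blob does:
  //   save_live_registers();
  //   info = Deoptimization::fetch_unroll_info(thread, exec_mode);
  //   pop the deoptimized frame;
  //   for each frame described by info: push a skeletal interpreter frame;
  //   Deoptimization::unpack_frames(thread, exec_mode);
  //   restore the result registers and return into the topmost new frame;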
2448   address start = __ pc();
2449   Label cont;
2450 
2451   // Prolog for non exception case!
2452 
2453   // Save everything in sight.
2454   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2455 
2456   // Normal deoptimization.  Save exec mode for unpack_frames.
2457   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2458   __ jmp(cont);
2459 
2460   int reexecute_offset = __ pc() - start;
2461 #if INCLUDE_JVMCI && !defined(COMPILER1)
2462   if (EnableJVMCI && UseJVMCICompiler) {
2463     // JVMCI does not use this kind of deoptimization
2464     __ should_not_reach_here();
2465   }
2466 #endif
2467 
2468   // Reexecute case
  // The return address is the pc that describes which bci to re-execute at.
2470 
2471   // No need to update map as each call to save_live_registers will produce identical oopmap
2472   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2473 
2474   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2475   __ jmp(cont);
2476 
2477 #if INCLUDE_JVMCI
2478   Label after_fetch_unroll_info_call;
2479   int implicit_exception_uncommon_trap_offset = 0;
2480   int uncommon_trap_offset = 0;
2481 
2482   if (EnableJVMCI) {
2483     implicit_exception_uncommon_trap_offset = __ pc() - start;
2484 
2485     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2486     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2487 
2488     uncommon_trap_offset = __ pc() - start;
2489 
2490     // Save everything in sight.
2491     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2492     // fetch_unroll_info needs to call last_java_frame()
2493     __ set_last_Java_frame(noreg, noreg, NULL);
2494 
2495     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2496     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2497 
2498     __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2499     __ mov(c_rarg0, r15_thread);
2500     __ movl(c_rarg2, r14); // exec mode
2501     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2502     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2503 
2504     __ reset_last_Java_frame(false);
2505 
2506     __ jmp(after_fetch_unroll_info_call);
2507   } // EnableJVMCI
2508 #endif // INCLUDE_JVMCI
2509 
2510   int exception_offset = __ pc() - start;
2511 
2512   // Prolog for exception case
2513 
2514   // all registers are dead at this entry point, except for rax, and
2515   // rdx which contain the exception oop and exception pc
2516   // respectively.  Set them in TLS and fall thru to the
2517   // unpack_with_exception_in_tls entry point.
2518 
2519   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2520   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2521 
2522   int exception_in_tls_offset = __ pc() - start;
2523 
2524   // new implementation because exception oop is now passed in JavaThread
2525 
2526   // Prolog for exception case
2527   // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2529   // tos: stack at point of call to method that threw the exception (i.e. only
2530   // args are on the stack, no return address)
2531 
2532   // make room on stack for the return address
2533   // It will be patched later with the throwing pc. The correct value is not
2534   // available now because loading it from memory would destroy registers.
2535   __ push(0);
2536 
2537   // Save everything in sight.
2538   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2539 
2540   // Now it is safe to overwrite any register
2541 
2542   // Deopt during an exception.  Save exec mode for unpack_frames.
2543   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2544 
2545   // load throwing pc from JavaThread and patch it as the return address
2546   // of the current frame. Then clear the field in JavaThread
2547 
2548   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2549   __ movptr(Address(rbp, wordSize), rdx);
2550   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2551 
2552 #ifdef ASSERT
2553   // verify that there is really an exception oop in JavaThread
2554   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2555   __ verify_oop(rax);
2556 
2557   // verify that there is no pending exception
2558   Label no_pending_exception;
2559   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2560   __ testptr(rax, rax);
2561   __ jcc(Assembler::zero, no_pending_exception);
2562   __ stop("must not have pending exception here");
2563   __ bind(no_pending_exception);
2564 #endif
2565 
2566   __ bind(cont);
2567 
2568   // Call C code.  Need thread and this frame, but NOT official VM entry
2569   // crud.  We cannot block on this call, no GC can happen.
2570   //
2571   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2572 
2573   // fetch_unroll_info needs to call last_java_frame().
2574 
2575   __ set_last_Java_frame(noreg, noreg, NULL);
2576 #ifdef ASSERT
2577   { Label L;
2578     __ cmpptr(Address(r15_thread,
                      JavaThread::last_Java_fp_offset()),
              (int32_t)0);
2581     __ jcc(Assembler::equal, L);
2582     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2583     __ bind(L);
2584   }
2585 #endif // ASSERT
2586   __ mov(c_rarg0, r15_thread);
2587   __ movl(c_rarg1, r14); // exec_mode
2588   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2589 
2590   // Need to have an oopmap that tells fetch_unroll_info where to
2591   // find any register it might need.
2592   oop_maps->add_gc_map(__ pc() - start, map);
2593 
2594   __ reset_last_Java_frame(false);
2595 
2596 #if INCLUDE_JVMCI
2597   if (EnableJVMCI) {
2598     __ bind(after_fetch_unroll_info_call);
2599   }
2600 #endif
2601 
2602   // Load UnrollBlock* into rdi
2603   __ mov(rdi, rax);
2604 
2605   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
  Label noException;
2607   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2608   __ jcc(Assembler::notEqual, noException);
2609   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2610   // QQQ this is useless it was NULL above
2611   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2612   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2613   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2614 
2615   __ verify_oop(rax);
2616 
2617   // Overwrite the result registers with the exception results.
2618   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2619   // I think this is useless
2620   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2621 
2622   __ bind(noException);
2623 
2624   // Only register save data is on the stack.
2625   // Now restore the result registers.  Everything else is either dead
2626   // or captured in the vframeArray.
2627   RegisterSaver::restore_result_registers(masm);
2628 
  // All of the register save area has been popped off the stack. Only the
2630   // return address remains.
2631 
2632   // Pop all the frames we must move/replace.
2633   //
2634   // Frame picture (youngest to oldest)
2635   // 1: self-frame (no frame link)
2636   // 2: deopting frame  (no frame link)
2637   // 3: caller of deopting frame (could be compiled/interpreted).
2638   //
2639   // Note: by leaving the return address of self-frame on the stack
2640   // and using the size of frame 2 to adjust the stack
2641   // when we are done the return to frame 3 will still be on the stack.
2642 
2643   // Pop deoptimized frame
2644   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2645   __ addptr(rsp, rcx);
2646 
2647   // rsp should be pointing at the return address to the caller (3)
2648 
2649   // Pick up the initial fp we should save
2650   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2651   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2652 
2653 #ifdef ASSERT
2654   // Compilers generate code that bang the stack by as much as the
2655   // interpreter would need. So this stack banging should never
2656   // trigger a fault. Verify that it does not on non product builds.
2657   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2658   __ bang_stack_size(rbx, rcx);
2659 #endif
2660 
2661   // Load address of array of frame pcs into rcx
2662   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2663 
2664   // Trash the old pc
2665   __ addptr(rsp, wordSize);
2666 
2667   // Load address of array of frame sizes into rsi
2668   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2669 
2670   // Load counter into rdx
2671   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2672 
2673   // Now adjust the caller's stack to make up for the extra locals
2674   // but record the original sp so that we can save it in the skeletal interpreter
2675   // frame and the stack walking of interpreter_sender will get the unextended sp
2676   // value and not the "real" sp value.
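  // In other words: rsp is extended by caller_adjustment bytes so that the
  // skeletal interpreter frames pushed below have room for their locals and
  // expression stacks, while sender_sp keeps the original (unextended) sp
  // that gets recorded in those frames.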
2677 
2678   const Register sender_sp = r8;
2679 
2680   __ mov(sender_sp, rsp);
2681   __ movl(rbx, Address(rdi,
2682                        Deoptimization::UnrollBlock::
2683                        caller_adjustment_offset_in_bytes()));
2684   __ subptr(rsp, rbx);
2685 
2686   // Push interpreter frames in a loop
2687   Label loop;
2688   __ bind(loop);
2689   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2690   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2691   __ pushptr(Address(rcx, 0));          // Save return address
2692   __ enter();                           // Save old & set new ebp
2693   __ subptr(rsp, rbx);                  // Prolog
2694   // This value is corrected by layout_activation_impl
2695   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2696   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2697   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2698   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2699   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2700   __ decrementl(rdx);                   // Decrement counter
2701   __ jcc(Assembler::notZero, loop);
2702   __ pushptr(Address(rcx, 0));          // Save final return address
2703 
2704   // Re-push self-frame
2705   __ enter();                           // Save old & set new ebp
2706 
2707   // Allocate a full sized register save area.
2708   // Return address and rbp are in place, so we allocate two less words.
2709   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2710 
2711   // Restore frame locals after moving the frame
2712   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2713   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2714 
2715   // Call C code.  Need thread but NOT official VM entry
2716   // crud.  We cannot block on this call, no GC can happen.  Call should
2717   // restore return values to their stack-slots with the new SP.
2718   //
2719   // BasicType Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2720 
2721   // Use rbp because the frames look interpreted now
2722   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2723   // Don't need the precise return PC here, just precise enough to point into this code blob.
2724   address the_pc = __ pc();
2725   __ set_last_Java_frame(noreg, rbp, the_pc);
2726 
2727   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2728   __ mov(c_rarg0, r15_thread);
2729   __ movl(c_rarg1, r14); // second arg: exec_mode
2730   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2731   // Revert SP alignment after call since we're going to do some SP relative addressing below
2732   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2733 
2734   // Set an oopmap for the call site
2735   // Use the same PC we used for the last java frame
2736   oop_maps->add_gc_map(the_pc - start,
2737                        new OopMap( frame_size_in_words, 0 ));
2738 
2739   // Clear fp AND pc
2740   __ reset_last_Java_frame(true);
2741 
2742   // Collect return values
2743   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2744   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2745   // I think this is useless (throwing pc?)
2746   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2747 
2748   // Pop self-frame.
2749   __ leave();                           // Epilog
2750 
2751   // Jump to interpreter
2752   __ ret(0);
2753 
2754   // Make sure all code is generated
2755   masm->flush();
2756 
2757   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2758   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2759 #if INCLUDE_JVMCI
2760   if (EnableJVMCI) {
2761     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2762     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2763   }
2764 #endif
2765 }
2766 
2767 #ifdef COMPILER2
2768 //------------------------------generate_uncommon_trap_blob--------------------
2769 void SharedRuntime::generate_uncommon_trap_blob() {
2770   // Allocate space for the code
2771   ResourceMark rm;
2772   // Setup code generation tools
2773   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2774   MacroAssembler* masm = new MacroAssembler(&buffer);
2775 
2776   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2777 
2778   address start = __ pc();
2779 
2780   if (UseRTMLocking) {
2781     // Abort RTM transaction before possible nmethod deoptimization.
2782     __ xabort(0);
2783   }
2784 
2785   // Push self-frame.  We get here with a return address on the
2786   // stack, so rsp is 8-byte aligned until we allocate our frame.
2787   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2788 
2789   // No callee saved registers. rbp is assumed implicitly saved
2790   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2791 
2792   // The compiler left unloaded_class_index in j_rarg0; move it to where
2793   // the runtime expects it.
2794   __ movl(c_rarg1, j_rarg0);
2795 
2796   __ set_last_Java_frame(noreg, noreg, NULL);
2797 
2798   // Call C code.  Need thread but NOT official VM entry
2799   // crud.  We cannot block on this call, no GC can happen.  Call should
2800   // capture callee-saved registers as well as return values.
2801   // Thread is in rdi already.
2802   //
2803   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
2804 
2805   __ mov(c_rarg0, r15_thread);
2806   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2807   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2808 
2809   // Set an oopmap for the call site
2810   OopMapSet* oop_maps = new OopMapSet();
2811   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2812 
2813   // location of rbp is known implicitly by the frame sender code
2814 
2815   oop_maps->add_gc_map(__ pc() - start, map);
2816 
2817   __ reset_last_Java_frame(false);
2818 
2819   // Load UnrollBlock* into rdi
2820   __ mov(rdi, rax);
2821 
2822 #ifdef ASSERT
2823   { Label L;
2824     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2825             (int32_t)Deoptimization::Unpack_uncommon_trap);
2826     __ jcc(Assembler::equal, L);
2827     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2828     __ bind(L);
2829   }
2830 #endif
2831 
2832   // Pop all the frames we must move/replace.
2833   //
2834   // Frame picture (youngest to oldest)
2835   // 1: self-frame (no frame link)
2836   // 2: deopting frame  (no frame link)
2837   // 3: caller of deopting frame (could be compiled/interpreted).
2838 
2839   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2840   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2841 
2842   // Pop deoptimized frame (int)
2843   __ movl(rcx, Address(rdi,
2844                        Deoptimization::UnrollBlock::
2845                        size_of_deoptimized_frame_offset_in_bytes()));
2846   __ addptr(rsp, rcx);
2847 
2848   // rsp should be pointing at the return address to the caller (3)
2849 
2850   // Pick up the initial fp we should save
2851   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2852   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2853 
2854 #ifdef ASSERT
2855   // Compilers generate code that bangs the stack by as much as the
2856   // interpreter would need. So this stack banging should never
2857   // trigger a fault. Verify that it does not on non-product builds.
2858   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2859   __ bang_stack_size(rbx, rcx);
2860 #endif
2861 
2862   // Load address of array of frame pcs into rcx (address*)
2863   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2864 
2865   // Trash the return pc
2866   __ addptr(rsp, wordSize);
2867 
2868   // Load address of array of frame sizes into rsi (intptr_t*)
2869   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2870 
2871   // Counter
2872   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
2873 
2874   // Now adjust the caller's stack to make up for the extra locals but
2875   // record the original sp so that we can save it in the skeletal
2876   // interpreter frame and the stack walking of interpreter_sender
2877   // will get the unextended sp value and not the "real" sp value.
2878 
2879   const Register sender_sp = r8;
2880 
2881   __ mov(sender_sp, rsp);
2882   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
2883   __ subptr(rsp, rbx);
2884 
2885   // Push interpreter frames in a loop
2886   Label loop;
2887   __ bind(loop);
2888   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2889   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2890   __ pushptr(Address(rcx, 0));     // Save return address
2891   __ enter();                      // Save old & set new rbp
2892   __ subptr(rsp, rbx);             // Prolog
2893   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2894             sender_sp);            // Make it walkable
2895   // This value is corrected by layout_activation_impl
2896   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2897   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
2898   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
2899   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
2900   __ decrementl(rdx);              // Decrement counter
2901   __ jcc(Assembler::notZero, loop);
2902   __ pushptr(Address(rcx, 0));     // Save final return address
2903 
2904   // Re-push self-frame
2905   __ enter();                 // Save old & set new rbp
2906   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2907                               // Prolog
2908 
2909   // Use rbp because the frames look interpreted now
2910   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2911   // Don't need the precise return PC here, just precise enough to point into this code blob.
2912   address the_pc = __ pc();
2913   __ set_last_Java_frame(noreg, rbp, the_pc);
2914 
2915   // Call C code.  Need thread but NOT official VM entry
2916   // crud.  We cannot block on this call, no GC can happen.  Call should
2917   // restore return values to their stack-slots with the new SP.
2918   // Thread is in rdi already.
2919   //
2920   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2921 
2922   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2923   __ mov(c_rarg0, r15_thread);
2924   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2925   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2926 
2927   // Set an oopmap for the call site
2928   // Use the same PC we used for the last java frame
2929   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2930 
2931   // Clear fp AND pc
2932   __ reset_last_Java_frame(true);
2933 
2934   // Pop self-frame.
2935   __ leave();                 // Epilog
2936 
2937   // Jump to interpreter
2938   __ ret(0);
2939 
2940   // Make sure all code is generated
2941   masm->flush();
2942 
2943   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
2944                                                  SimpleRuntimeFrame::framesize >> 1);
2945 }
2946 #endif // COMPILER2
2947 
2948 //------------------------------generate_handler_blob------
2949 //
2950 // Generate a special Compile2Runtime blob that saves all registers
2951 // and sets up an oopmap.
2952 //
2953 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
2954   assert(StubRoutines::forward_exception_entry() != NULL,
2955          "must be generated before");
2956 
2957   ResourceMark rm;
2958   OopMapSet *oop_maps = new OopMapSet();
2959   OopMap* map;
2960 
2961   // Allocate space for the code.  Setup code generation tools.
2962   CodeBuffer buffer("handler_blob", 2048, 1024);
2963   MacroAssembler* masm = new MacroAssembler(&buffer);
2964 
2965   address start   = __ pc();
2966   address call_pc = NULL;
2967   int frame_size_in_words;
2968   bool cause_return = (poll_type == POLL_AT_RETURN);
2969   bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
2970 
2971   if (UseRTMLocking) {
2972     // Abort RTM transaction before calling runtime
2973     // because critical section will be large and will be
2974     // aborted anyway. Also nmethod could be deoptimized.
2975     __ xabort(0);
2976   }
2977 
2978   // Make room for return address (or push it again)
2979   if (!cause_return) {
2980     __ push(rbx);
2981   }
2982 
2983   // Save registers, fpu state, and flags
2984   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
2985 
2986   // The following is basically a call_VM.  However, we need the precise
2987   // address of the call in order to generate an oopmap. Hence, we do all the
2988   // work ourselves.
2989 
2990   __ set_last_Java_frame(noreg, noreg, NULL);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
2991 
2992   // The return address must always be correct so that frame constructor never
2993   // sees an invalid pc.
2994 
2995   if (!cause_return) {
2996     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
2997     // Additionally, rbx is a callee saved register and we can look at it later to determine
2998     // if someone changed the return address for us!
2999     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3000     __ movptr(Address(rbp, wordSize), rbx);
3001   }
3002 
3003   // Do the call
3004   __ mov(c_rarg0, r15_thread);
3005   __ call(RuntimeAddress(call_ptr));
3006 
3007   // Set an oopmap for the call site.  This oopmap will map all
3008   // oop-registers and debug-info registers as callee-saved.  This
3009   // will allow deoptimization at this safepoint to find all possible
3010   // debug-info recordings, as well as let GC find all oops.
3011 
3012   oop_maps->add_gc_map( __ pc() - start, map);
3013 
3014   Label noException;
3015 
3016   __ reset_last_Java_frame(false);
3017 
3018   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3019   __ jcc(Assembler::equal, noException);
3020 
3021   // Exception pending
3022 
3023   RegisterSaver::restore_live_registers(masm, save_vectors);
3024 
3025   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3026 
3027   // No exception case
3028   __ bind(noException);
3029 
3030   Label no_adjust;
3031 #ifdef ASSERT
3032   Label bail;
3033 #endif
3034   if (!cause_return) {
3035     Label no_prefix, not_special;
3036 
3037     // If our stashed return pc was modified by the runtime we avoid touching it
3038     __ cmpptr(rbx, Address(rbp, wordSize));
3039     __ jccb(Assembler::notEqual, no_adjust);
3040 
3041     // Skip over the poll instruction.
3042     // See NativeInstruction::is_safepoint_poll()
3043     // Possible encodings:
3044     //      85 00       test   %eax,(%rax)
3045     //      85 01       test   %eax,(%rcx)
3046     //      85 02       test   %eax,(%rdx)
3047     //      85 03       test   %eax,(%rbx)
3048     //      85 06       test   %eax,(%rsi)
3049     //      85 07       test   %eax,(%rdi)
3050     //
3051     //   41 85 00       test   %eax,(%r8)
3052     //   41 85 01       test   %eax,(%r9)
3053     //   41 85 02       test   %eax,(%r10)
3054     //   41 85 03       test   %eax,(%r11)
3055     //   41 85 06       test   %eax,(%r14)
3056     //   41 85 07       test   %eax,(%r15)
3057     //
3058     //      85 04 24    test   %eax,(%rsp)
3059     //   41 85 04 24    test   %eax,(%r12)
3060     //      85 45 00    test   %eax,0x0(%rbp)
3061     //   41 85 45 00    test   %eax,0x0(%r13)
3062 
3063     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3064     __ jcc(Assembler::notEqual, no_prefix);
3065     __ addptr(rbx, 1);
3066     __ bind(no_prefix);
3067 #ifdef ASSERT
3068     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3069 #endif
3070     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3071     // r12/rsp 0x04
3072     // r13/rbp 0x05
3073     __ movzbq(rcx, Address(rbx, 1));
3074     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3075     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3076     __ cmpptr(rcx, 1);
3077     __ jcc(Assembler::above, not_special);
3078     __ addptr(rbx, 1);
3079     __ bind(not_special);
3080 #ifdef ASSERT
3081     // Verify the correct encoding of the poll we're about to skip.
3082     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3083     __ jcc(Assembler::notEqual, bail);
3084     // Mask out the modrm bits
3085     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3086     // rax encodes to 0, so if the bits are nonzero it's incorrect
3087     __ jcc(Assembler::notZero, bail);
3088 #endif
3089     // Adjust return pc forward to step over the safepoint poll instruction
3090     __ addptr(rbx, 2);
3091     __ movptr(Address(rbp, wordSize), rbx);
3092   }
3093 
3094   __ bind(no_adjust);
3095   // Normal exit, restore registers and exit.
3096   RegisterSaver::restore_live_registers(masm, save_vectors);
3097   __ ret(0);
3098 
3099 #ifdef ASSERT
3100   __ bind(bail);
3101   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3102 #endif
3103 
3104   // Make sure all code is generated
3105   masm->flush();
3106 
3107   // Fill-out other meta info
3108   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3109 }
3110 
3111 //
3112 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3113 //
3114 // Generate a stub that calls into the VM to find out the proper destination
3115 // of a Java call. All the argument registers are live at this point,
3116 // but since this is generic code we don't know what they are and the caller
3117 // must do any GC of the args.
3118 //
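// The destination is one of the SharedRuntime resolution entries
// (e.g. resolve_static_call_C and its virtual / opt-virtual / ic-miss
// counterparts); whatever address it hands back ends up in rax and is where
// this stub jumps to, unless a pending exception was installed.
//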
3119 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3120   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3121 
3122   // allocate space for the code
3123   ResourceMark rm;
3124 
3125   CodeBuffer buffer(name, 1000, 512);
3126   MacroAssembler* masm                = new MacroAssembler(&buffer);
3127 
3128   int frame_size_in_words;
3129 
3130   OopMapSet *oop_maps = new OopMapSet();
3131   OopMap* map = NULL;
3132 
3133   int start = __ offset();
3134 
3135   // No need to save vector registers since they are caller-saved anyway.
3136   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
3137 
3138   // __ stop_if_in_cont(r10, "CONT 3");
3139 
3140   int frame_complete = __ offset();
3141 
3142   __ set_last_Java_frame(noreg, noreg, NULL);
3143 
3144   __ mov(c_rarg0, r15_thread);
3145 
3146   __ call(RuntimeAddress(destination));
3147 
3148 
3149   // Set an oopmap for the call site.
3150   // We need this not only for callee-saved registers, but also for volatile
3151   // registers that the compiler might be keeping live across a safepoint.
3152 
3153   oop_maps->add_gc_map( __ offset() - start, map);
3154 
3155   // rax contains the address we are going to jump to assuming no exception got installed
3156 
3157   // clear last_Java_sp
3158   __ reset_last_Java_frame(false);
3159   // check for pending exceptions
3160   Label pending;
3161   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3162   __ jcc(Assembler::notEqual, pending);
3163 
3164   // get the returned Method*
3165   __ get_vm_result_2(rbx, r15_thread);
3166   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3167 
3168   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3169 
3170   RegisterSaver::restore_live_registers(masm);
3171 
3172   // We are back to the original state on entry and ready to go.
3173 
3174   __ jmp(rax);
3175 
3176   // Pending exception after the safepoint
3177 
3178   __ bind(pending);
3179 
3180   RegisterSaver::restore_live_registers(masm);
3181 
3182   // exception pending => remove activation and forward to exception handler
3183 
3184   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3185 
3186   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3187   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3188 
3189   // -------------
3190   // make sure all code is generated
3191   masm->flush();
3192 
3193   // Return the blob.
3194   // new_runtime_stub expects the frame size in words, which is what we pass.
3195   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3196 }
3197 
3198 #ifdef COMPILER2
3199 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3200 
3201 class NativeInvokerGenerator : public StubCodeGenerator {
3202   address _call_target;
3203   int _shadow_space_bytes;
3204 
3205   const GrowableArray<VMReg>& _input_registers;
3206   const GrowableArray<VMReg>& _output_registers;
3207 
3208   int _frame_complete;
3209   int _framesize;
3210   OopMapSet* _oop_maps;
3211 public:
3212   NativeInvokerGenerator(CodeBuffer* buffer,
3213                          address call_target,
3214                          int shadow_space_bytes,
3215                          const GrowableArray<VMReg>& input_registers,
3216                          const GrowableArray<VMReg>& output_registers)
3217    : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3218      _call_target(call_target),
3219      _shadow_space_bytes(shadow_space_bytes),
3220      _input_registers(input_registers),
3221      _output_registers(output_registers),
3222      _frame_complete(0),
3223      _framesize(0),
3224      _oop_maps(NULL) {
3225     assert(_output_registers.length() <= 1
3226            || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3227 
3228   }
3229 
3230   void generate();
3231 
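  // Bytes needed to spill the single output register across the slow paths
  // below: 8 for a general-purpose register, or the full vector width
  // (16/32/64 bytes for SSE/AVX/AVX-512) for an XMM register, matching the
  // stores done in spill_out_registers().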
3232   int spill_size_in_bytes() const {
3233     if (_output_registers.length() == 0) {
3234       return 0;
3235     }
3236     VMReg reg = _output_registers.at(0);
3237     assert(reg->is_reg(), "must be a register");
3238     if (reg->is_Register()) {
3239       return 8;
3240     } else if (reg->is_XMMRegister()) {
3241       if (UseAVX >= 3) {
3242         return 64;
3243       } else if (UseAVX >= 1) {
3244         return 32;
3245       } else {
3246         return 16;
3247       }
3248     } else {
3249       ShouldNotReachHere();
3250     }
3251     return 0;
3252   }
3253 
3254   void spill_out_registers() {
3255     if (_output_registers.length() == 0) {
3256       return;
3257     }
3258     VMReg reg = _output_registers.at(0);
3259     assert(reg->is_reg(), "must be a register");
3260     MacroAssembler* masm = _masm;
3261     if (reg->is_Register()) {
3262       __ movptr(Address(rsp, 0), reg->as_Register());
3263     } else if (reg->is_XMMRegister()) {
3264       if (UseAVX >= 3) {
3265         __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3266       } else if (UseAVX >= 1) {
3267         __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3268       } else {
3269         __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3270       }
3271     } else {
3272       ShouldNotReachHere();
3273     }
3274   }
3275 
3276   void fill_out_registers() {
3277     if (_output_registers.length() == 0) {
3278       return;
3279     }
3280     VMReg reg = _output_registers.at(0);
3281     assert(reg->is_reg(), "must be a register");
3282     MacroAssembler* masm = _masm;
3283     if (reg->is_Register()) {
3284       __ movptr(reg->as_Register(), Address(rsp, 0));
3285     } else if (reg->is_XMMRegister()) {
3286       if (UseAVX >= 3) {
3287         __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3288       } else if (UseAVX >= 1) {
3289         __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3290       } else {
3291         __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3292       }
3293     } else {
3294       ShouldNotReachHere();
3295     }
3296   }
3297 
3298   int frame_complete() const {
3299     return _frame_complete;
3300   }
3301 
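  // _framesize is kept in 4-byte stack slots; convert it to words, which is
  // what RuntimeStub::new_runtime_stub expects.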
3302   int framesize() const {
3303     return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3304   }
3305 
3306   OopMapSet* oop_maps() const {
3307     return _oop_maps;
3308   }
3309 
3310 private:
3311 #ifdef ASSERT
3312   bool target_uses_register(VMReg reg) {
3313     return _input_registers.contains(reg) || _output_registers.contains(reg);
3314   }
3315 #endif
3316 };
3317 
3318 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3319                                                 int shadow_space_bytes,
3320                                                 const GrowableArray<VMReg>& input_registers,
3321                                                 const GrowableArray<VMReg>& output_registers) {
3322   int locs_size  = 64;
3323   CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3324   NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3325   g.generate();
3326   code.log_section_sizes("nep_invoker_blob");
3327 
3328   RuntimeStub* stub =
3329     RuntimeStub::new_runtime_stub("nep_invoker_blob",
3330                                   &code,
3331                                   g.frame_complete(),
3332                                   g.framesize(),
3333                                   g.oop_maps(), false);
3334   return stub;
3335 }
3336 
3337 void NativeInvokerGenerator::generate() {
3338   assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3339 
3340   enum layout {
3341     rbp_off,
3342     rbp_off2,
3343     return_off,
3344     return_off2,
3345     framesize // inclusive of return address
3346   };
3347 
3348   _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3349   assert(is_even(_framesize/2), "sp not 16-byte aligned");
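  // For example, with no shadow space and a single general-purpose output
  // register (spill_size_in_bytes() == 8, i.e. 2 slots), this gives
  // align_up(4 + 2, 4) == 8 slots == 32 bytes, keeping rsp 16-byte aligned.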
3350 
3351   _oop_maps  = new OopMapSet();
3352   MacroAssembler* masm = _masm;
3353 
3354   address start = __ pc();
3355 
3356   __ enter();
3357 
3358   // return address and rbp are already in place
3359   __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3360 
3361   _frame_complete = __ pc() - start;
3362 
3363   address the_pc = __ pc();
3364 
3365   __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3366   OopMap* map = new OopMap(_framesize, 0);
3367   _oop_maps->add_gc_map(the_pc - start, map);
3368 
3369   // State transition
3370   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3371 
3372   __ call(RuntimeAddress(_call_target));
3373 
3374   __ restore_cpu_control_state_after_jni();
3375 
3376   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3377 
3378   // Force this write out before the read below
3379   __ membar(Assembler::Membar_mask_bits(
3380           Assembler::LoadLoad | Assembler::LoadStore |
3381           Assembler::StoreLoad | Assembler::StoreStore));
3382 
3383   Label L_after_safepoint_poll;
3384   Label L_safepoint_poll_slow_path;
3385 
3386   __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3387   __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3388   __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3389 
3390   __ bind(L_after_safepoint_poll);
3391 
3392   // change thread state
3393   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3394 
3395   __ block_comment("reguard stack check");
3396   Label L_reguard;
3397   Label L_after_reguard;
3398   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3399   __ jcc(Assembler::equal, L_reguard);
3400   __ bind(L_after_reguard);
3401 
3402   __ reset_last_Java_frame(r15_thread, true);
3403 
3404   __ leave(); // required for proper stackwalking of RuntimeStub frame
3405   __ ret(0);
3406 
3407   //////////////////////////////////////////////////////////////////////////////
3408 
3409   __ block_comment("{ L_safepoint_poll_slow_path");
3410   __ bind(L_safepoint_poll_slow_path);
3411   __ vzeroupper();
3412 
3413   spill_out_registers();
3414 
3415   __ mov(c_rarg0, r15_thread);
3416   __ mov(r12, rsp); // remember sp
3417   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3418   __ andptr(rsp, -16); // align stack as required by ABI
3419   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3420   __ mov(rsp, r12); // restore sp
3421   __ reinit_heapbase();
3422 
3423   fill_out_registers();
3424 
3425   __ jmp(L_after_safepoint_poll);
3426   __ block_comment("} L_safepoint_poll_slow_path");
3427 
3428   //////////////////////////////////////////////////////////////////////////////
3429 
3430   __ block_comment("{ L_reguard");
3431   __ bind(L_reguard);
3432   __ vzeroupper();
3433 
3434   spill_out_registers();
3435 
3436   __ mov(r12, rsp); // remember sp
3437   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3438   __ andptr(rsp, -16); // align stack as required by ABI
3439   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3440   __ mov(rsp, r12); // restore sp
3441   __ reinit_heapbase();
3442 
3443   fill_out_registers();
3444 
3445   __ jmp(L_after_reguard);
3446 
3447   __ block_comment("} L_reguard");
3448 
3449   //////////////////////////////////////////////////////////////////////////////
3450 
3451   __ flush();
3452 }
3453 #endif // COMPILER2
3454 
3455 //------------------------------Montgomery multiplication------------------------
3456 //
3457 
3458 #ifndef _WINDOWS
3459 
3460 // Subtract 0:b from carry:a.  Return carry.
3461 static julong
3462 sub(julong a[], julong b[], julong carry, long len) {
3463   long long i = 0, cnt = len;
3464   julong tmp;
3465   asm volatile("clc; "
3466                "0: ; "
3467                "mov (%[b], %[i], 8), %[tmp]; "
3468                "sbb %[tmp], (%[a], %[i], 8); "
3469                "inc %[i]; dec %[cnt]; "
3470                "jne 0b; "
3471                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3472                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3473                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3474                : "memory");
3475   return tmp;
3476 }
3477 
3478 // Multiply (unsigned) Long A by Long B, accumulating the double-
3479 // length result into the accumulator formed of T0, T1, and T2.
3480 #define MACC(A, B, T0, T1, T2)                                  \
3481 do {                                                            \
3482   unsigned long hi, lo;                                         \
3483   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3484            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3485            : "r"(A), "a"(B) : "cc");                            \
3486  } while(0)
3487 
3488 // As above, but add twice the double-length result into the
3489 // accumulator.
3490 #define MACC2(A, B, T0, T1, T2)                                 \
3491 do {                                                            \
3492   unsigned long hi, lo;                                         \
3493   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3494            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3495            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3496            : "r"(A), "a"(B) : "cc");                            \
3497  } while(0)
3498 
3499 #else //_WINDOWS
3500 
3501 static julong
3502 sub(julong a[], julong b[], julong carry, long len) {
3503   long i;
3504   julong tmp;
3505   unsigned char c = 1;
3506   for (i = 0; i < len; i++) {
3507     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3508     a[i] = tmp;
3509   }
3510   c = _addcarry_u64(c, carry, ~0, &tmp);
3511   return tmp;
3512 }
3513 
3514 // Multiply (unsigned) Long A by Long B, accumulating the double-
3515 // length result into the accumulator formed of T0, T1, and T2.
3516 #define MACC(A, B, T0, T1, T2)                          \
3517 do {                                                    \
3518   julong hi, lo;                            \
3519   lo = _umul128(A, B, &hi);                             \
3520   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3521   c = _addcarry_u64(c, hi, T1, &T1);                    \
3522   _addcarry_u64(c, T2, 0, &T2);                         \
3523  } while(0)
3524 
3525 // As above, but add twice the double-length result into the
3526 // accumulator.
3527 #define MACC2(A, B, T0, T1, T2)                         \
3528 do {                                                    \
3529   julong hi, lo;                            \
3530   lo = _umul128(A, B, &hi);                             \
3531   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3532   c = _addcarry_u64(c, hi, T1, &T1);                    \
3533   _addcarry_u64(c, T2, 0, &T2);                         \
3534   c = _addcarry_u64(0, lo, T0, &T0);                    \
3535   c = _addcarry_u64(c, hi, T1, &T1);                    \
3536   _addcarry_u64(c, T2, 0, &T2);                         \
3537  } while(0)
3538 
3539 #endif //_WINDOWS
3540 
3541 // Fast Montgomery multiplication.  The derivation of the algorithm is
3542 // in  A Cryptographic Library for the Motorola DSP56000,
3543 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
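// The operands are arrays of len 64-bit words, least significant word first,
// and inv satisfies inv * n[0] == -1 (mod 2^64) (the asserts below check this
// as inv * n[0] == ULLONG_MAX).  At step i the choice m[i] = t0 * inv makes
// t0 + m[i] * n[0] vanish modulo 2^64, so the low accumulator word is zero
// (the "broken Montgomery multiply" assert) and the accumulator can be
// shifted down one word; the trailing while-loop subtracts n until the
// leftover carry word t0 is gone.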
3544 
3545 static void NOINLINE
3546 montgomery_multiply(julong a[], julong b[], julong n[],
3547                     julong m[], julong inv, int len) {
3548   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3549   int i;
3550 
3551   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3552 
3553   for (i = 0; i < len; i++) {
3554     int j;
3555     for (j = 0; j < i; j++) {
3556       MACC(a[j], b[i-j], t0, t1, t2);
3557       MACC(m[j], n[i-j], t0, t1, t2);
3558     }
3559     MACC(a[i], b[0], t0, t1, t2);
3560     m[i] = t0 * inv;
3561     MACC(m[i], n[0], t0, t1, t2);
3562 
3563     assert(t0 == 0, "broken Montgomery multiply");
3564 
3565     t0 = t1; t1 = t2; t2 = 0;
3566   }
3567 
3568   for (i = len; i < 2*len; i++) {
3569     int j;
3570     for (j = i-len+1; j < len; j++) {
3571       MACC(a[j], b[i-j], t0, t1, t2);
3572       MACC(m[j], n[i-j], t0, t1, t2);
3573     }
3574     m[i-len] = t0;
3575     t0 = t1; t1 = t2; t2 = 0;
3576   }
3577 
3578   while (t0)
3579     t0 = sub(m, n, t0, len);
3580 }
3581 
3582 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3583 // multiplies so it should be up to 25% faster than Montgomery
3584 // multiplication.  However, its loop control is more complex and it
3585 // may actually run slower on some machines.
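// The saving comes from symmetry: each off-diagonal product a[j]*a[i-j]
// (j != i-j) appears twice, hence MACC2, and only for even i is there a
// single diagonal term a[i/2]*a[i/2], handled by the MACC under
// "(i & 1) == 0".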
3586 
3587 static void NOINLINE
3588 montgomery_square(julong a[], julong n[],
3589                   julong m[], julong inv, int len) {
3590   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3591   int i;
3592 
3593   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3594 
3595   for (i = 0; i < len; i++) {
3596     int j;
3597     int end = (i+1)/2;
3598     for (j = 0; j < end; j++) {
3599       MACC2(a[j], a[i-j], t0, t1, t2);
3600       MACC(m[j], n[i-j], t0, t1, t2);
3601     }
3602     if ((i & 1) == 0) {
3603       MACC(a[j], a[j], t0, t1, t2);
3604     }
3605     for (; j < i; j++) {
3606       MACC(m[j], n[i-j], t0, t1, t2);
3607     }
3608     m[i] = t0 * inv;
3609     MACC(m[i], n[0], t0, t1, t2);
3610 
3611     assert(t0 == 0, "broken Montgomery square");
3612 
3613     t0 = t1; t1 = t2; t2 = 0;
3614   }
3615 
3616   for (i = len; i < 2*len; i++) {
3617     int start = i-len+1;
3618     int end = start + (len - start)/2;
3619     int j;
3620     for (j = start; j < end; j++) {
3621       MACC2(a[j], a[i-j], t0, t1, t2);
3622       MACC(m[j], n[i-j], t0, t1, t2);
3623     }
3624     if ((i & 1) == 0) {
3625       MACC(a[j], a[j], t0, t1, t2);
3626     }
3627     for (; j < len; j++) {
3628       MACC(m[j], n[i-j], t0, t1, t2);
3629     }
3630     m[i-len] = t0;
3631     t0 = t1; t1 = t2; t2 = 0;
3632   }
3633 
3634   while (t0)
3635     t0 = sub(m, n, t0, len);
3636 }
3637 
3638 // Swap words in a longword.
3639 static julong swap(julong x) {
3640   return (x << 32) | (x >> 32);
3641 }
3642 
3643 // Copy len longwords from s to d, word-swapping as we go.  The
3644 // destination array is reversed.
3645 static void reverse_words(julong *s, julong *d, int len) {
3646   d += len;
3647   while(len-- > 0) {
3648     d--;
3649     *d = swap(*s);
3650     s++;
3651   }
3652 }
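// For example, with len == 2 and a jint array {i0, i1, i2, i3} in memory
// order, the little-endian julong view gives s[0] == i1:i0 and s[1] == i3:i2
// (high:low), so after swapping and reversing the destination holds
// d[1] == i0:i1 and d[0] == i2:i3.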
3653 
3654 // The threshold at which squaring is advantageous was determined
3655 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3656 #define MONTGOMERY_SQUARING_THRESHOLD 64
3657 
3658 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3659                                         jint len, jlong inv,
3660                                         jint *m_ints) {
3661   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3662   int longwords = len/2;
3663 
3664   // Make very sure we don't use so much space that the stack might
3665   // overflow.  512 jints corresponds to a 16384-bit integer and will use
3666   // here a total of 8K bytes of stack space (256 longwords * 8 * 4 arrays).
3667   int total_allocation = longwords * sizeof (julong) * 4;
3668   guarantee(total_allocation <= 8192, "must be");
3669   julong *scratch = (julong *)alloca(total_allocation);
3670 
3671   // Local scratch arrays
3672   julong
3673     *a = scratch + 0 * longwords,
3674     *b = scratch + 1 * longwords,
3675     *n = scratch + 2 * longwords,
3676     *m = scratch + 3 * longwords;
3677 
3678   reverse_words((julong *)a_ints, a, longwords);
3679   reverse_words((julong *)b_ints, b, longwords);
3680   reverse_words((julong *)n_ints, n, longwords);
3681 
3682   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3683 
3684   reverse_words(m, (julong *)m_ints, longwords);
3685 }
3686 
3687 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3688                                       jint len, jlong inv,
3689                                       jint *m_ints) {
3690   assert(len % 2 == 0, "array length in montgomery_square must be even");
3691   int longwords = len/2;
3692 
3693   // Make very sure we don't use so much space that the stack might
3694   // overflow.  512 jints corresponds to a 16384-bit integer and will use
3695   // here a total of 6K bytes of stack space (256 longwords * 8 * 3 arrays).
3696   int total_allocation = longwords * sizeof (julong) * 3;
3697   guarantee(total_allocation <= 8192, "must be");
3698   julong *scratch = (julong *)alloca(total_allocation);
3699 
3700   // Local scratch arrays
3701   julong
3702     *a = scratch + 0 * longwords,
3703     *n = scratch + 1 * longwords,
3704     *m = scratch + 2 * longwords;
3705 
3706   reverse_words((julong *)a_ints, a, longwords);
3707   reverse_words((julong *)n_ints, n, longwords);
3708 
3709   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3710     ::montgomery_square(a, n, m, (julong)inv, longwords);
3711   } else {
3712     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3713   }
3714 
3715   reverse_words(m, (julong *)m_ints, longwords);
3716 }
3717 
3718 #ifdef COMPILER2
3719 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3720 //
3721 //------------------------------generate_exception_blob---------------------------
3722 // Creates the exception blob at the end.
3723 // This code is jumped to from a compiled method when an exception is thrown
3724 // (see emit_exception_handler in the x86_64.ad file).
3725 //
3726 // Given an exception pc at a call we call into the runtime for the
3727 // handler in this method. This handler might merely restore state
3728 // (i.e. callee save registers), unwind the frame, and jump to the
3729 // exception handler for the nmethod if there is no Java level handler
3730 // for the nmethod.
3731 //
3732 // This code is entered with a jmp.
3733 //
3734 // Arguments:
3735 //   rax: exception oop
3736 //   rdx: exception pc
3737 //
3738 // Results:
3739 //   rax: exception oop
3740 //   rdx: exception pc in caller or ???
3741 //   destination: exception handler of caller
3742 //
3743 // Note: the exception pc MUST be at a call (precise debug information)
3744 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3745 //
3746 
3747 void OptoRuntime::generate_exception_blob() {
3748   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3749   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3750   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3751 
3752   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3753 
3754   // Allocate space for the code
3755   ResourceMark rm;
3756   // Setup code generation tools
3757   CodeBuffer buffer("exception_blob", 2048, 1024);
3758   MacroAssembler* masm = new MacroAssembler(&buffer);
3759 
3760 
3761   address start = __ pc();
3762 
3763   // Exception pc is 'return address' for stack walker
3764   __ push(rdx);
3765   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3766 
3767   // Save callee-saved registers.  See x86_64.ad.
3768 
3769   // rbp is an implicitly saved callee saved register (i.e., the calling
3770   // convention will save/restore it in the prolog/epilog). Other than that
3771   // there are no callee save registers now that adapter frames are gone.
3772 
3773   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3774 
3775   // Store exception in Thread object. We cannot pass any arguments to the
3776   // handle_exception call, since we do not want to make any assumption
3777   // about the size of the frame where the exception happened in.
3778   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3779   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3780   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3781 
3782   // This call does all the hard work.  It checks if an exception handler
3783   // exists in the method.
3784   // If so, it returns the handler address.
3785   // If not, it prepares for stack-unwinding, restoring the callee-save
3786   // registers of the frame being removed.
3787   //
3788   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3789 
3790   // At a method handle call, the stack may not be properly aligned
3791   // when returning with an exception.
3792   address the_pc = __ pc();
3793   __ set_last_Java_frame(noreg, noreg, the_pc);
3794   __ mov(c_rarg0, r15_thread);
3795   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3796   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3797 
3798   // Set an oopmap for the call site.  This oopmap will only be used if we
3799   // are unwinding the stack.  Hence, all locations will be dead.
3800   // Callee-saved registers will be the same as the frame above (i.e.,
3801   // handle_exception_stub), since they were restored when we got the
3802   // exception.
3803 
3804   OopMapSet* oop_maps = new OopMapSet();
3805 
3806   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3807 
3808   __ reset_last_Java_frame(false);
3809 
3810   // Restore callee-saved registers
3811 
3812   // rbp is an implicitly saved callee-saved register (i.e., the calling
3813 // convention will save/restore it in the prolog/epilog). Other than that
3814   // there are no callee save registers now that adapter frames are gone.
3815 
3816   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3817 
3818   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3819   __ pop(rdx);                  // No need for exception pc anymore
3820 
3821   // rax: exception handler
3822 
3823   // We have a handler in rax (could be deopt blob).
3824   __ mov(r8, rax);
3825 
3826   // Get the exception oop
3827   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3828   // Get the exception pc in case we are deoptimized
3829   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3830 #ifdef ASSERT
3831   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3832   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3833 #endif
3834   // Clear the exception oop so GC no longer processes it as a root.
3835   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3836 
3837   // rax: exception oop
3838   // r8:  exception handler
3839   // rdx: exception pc
3840   // Jump to handler
3841 
3842   __ jmp(r8);
3843 
3844   // Make sure all code is generated
3845   masm->flush();
3846 
3847   // Set exception blob
3848   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3849 }
3850 #endif // COMPILER2
3851 
3852 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
3853                                        int total_in_args, const VMRegPair* in_regs,
3854                                        int total_out_args, VMRegPair* out_regs,
3855                                        GrowableArray<int>& arg_order,
3856                                        VMRegPair tmp_vmreg) {
3857   ComputeMoveOrder order(total_in_args, in_regs,
3858                          total_out_args, out_regs,
3859                          in_sig_bt, arg_order, tmp_vmreg);
3860 }
--- EOF ---