1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/debugInfoRec.hpp"
  32 #include "code/icBuffer.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/compiledICHolder.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/jniHandles.hpp"
  48 #include "runtime/safepointMechanism.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/signature.hpp"
  51 #include "runtime/stubRoutines.hpp"
  52 #include "runtime/vframeArray.hpp"
  53 #include "runtime/vm_version.hpp"
  54 #include "utilities/align.hpp"
  55 #include "utilities/formatBuffer.hpp"
  56 #include "vmreg_x86.inline.hpp"
  57 #ifdef COMPILER1
  58 #include "c1/c1_Runtime1.hpp"
  59 #endif
  60 #ifdef COMPILER2
  61 #include "opto/runtime.hpp"
  62 #endif
  63 #if INCLUDE_JVMCI
  64 #include "jvmci/jvmciJavaClasses.hpp"
  65 #endif
  66 
  67 #define __ masm->
  68 
  69 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  70 
  71 class SimpleRuntimeFrame {
  72 
  73   public:
  74 
  75   // Most of the runtime stubs have this simple frame layout.
  76   // This class exists to make the layout shared in one place.
  77   // Offsets are for compiler stack slots, which are jints.
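       // (multiply an offset by VMRegImpl::stack_slot_size, i.e. BytesPerInt, to get bytes)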
  78   enum layout {
  79     // The frame sender code expects that rbp will be in the "natural" place and
  80     // will override any oopMap setting for it. We must therefore force the layout
  81     // so that it agrees with the frame sender code.
  82     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  83     rbp_off2,
  84     return_off, return_off2,
  85     framesize
  86   };
  87 };
  88 
  89 class RegisterSaver {
  90   // Capture info about frame layout.  Layout offsets are in jint
  91   // units because compiler frame slots are jints.
  92 #define XSAVE_AREA_BEGIN 160
  93 #define XSAVE_AREA_YMM_BEGIN 576
  94 #define XSAVE_AREA_OPMASK_BEGIN 1088
  95 #define XSAVE_AREA_ZMM_BEGIN 1152
  96 #define XSAVE_AREA_UPPERBANK 1664
  97 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  98 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  99 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 100 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 101 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 102   enum layout {
 103     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 104     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 105     DEF_XMM_OFFS(0),
 106     DEF_XMM_OFFS(1),
 107     // 2..15 are implied in range usage
 108     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 109     DEF_YMM_OFFS(0),
 110     DEF_YMM_OFFS(1),
 111     // 2..15 are implied in range usage
 112     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 113     DEF_OPMASK_OFFS(0),
 114     DEF_OPMASK_OFFS(1),
 115     // 2..7 are implied in range usage
 116     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 117     DEF_ZMM_OFFS(0),
 118     DEF_ZMM_OFFS(1),
 119     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 120     DEF_ZMM_UPPER_OFFS(16),
 121     DEF_ZMM_UPPER_OFFS(17),
 122     // 18..31 are implied in range usage
 123     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 124     fpu_stateH_end,
 125     r15_off, r15H_off,
 126     r14_off, r14H_off,
 127     r13_off, r13H_off,
 128     r12_off, r12H_off,
 129     r11_off, r11H_off,
 130     r10_off, r10H_off,
 131     r9_off,  r9H_off,
 132     r8_off,  r8H_off,
 133     rdi_off, rdiH_off,
 134     rsi_off, rsiH_off,
 135     ignore_off, ignoreH_off,  // extra copy of rbp
 136     rsp_off, rspH_off,
 137     rbx_off, rbxH_off,
 138     rdx_off, rdxH_off,
 139     rcx_off, rcxH_off,
 140     rax_off, raxH_off,
 141     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 142     align_off, alignH_off,
 143     flags_off, flagsH_off,
 144     // The frame sender code expects that rbp will be in the "natural" place and
 145     // will override any oopMap setting for it. We must therefore force the layout
 146     // so that it agrees with the frame sender code.
 147     rbp_off, rbpH_off,        // copy of rbp we will restore
 148     return_off, returnH_off,  // slot for return address
 149     reg_save_size             // size in compiler stack slots
 150   };
 151 
 152  public:
 153   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors);
 154   static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 155 
 156   // Offsets into the register save area
 157   // Used by deoptimization when it is managing result register
 158   // values on its own
 159 
 160   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 161   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 162   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 163   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 164   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 165 
 166   // During deoptimization only the result registers need to be restored,
 167   // all the other values have already been extracted.
 168   static void restore_result_registers(MacroAssembler* masm);
 169 };
 170 
 171 // Register is a class, but it is assigned a numerical value.
 172 // "0" is assigned to rax, so we need to ignore -Wnonnull.
 173 PRAGMA_DIAG_PUSH
 174 PRAGMA_NONNULL_IGNORED
 175 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
 176   int off = 0;
 177   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 178   if (UseAVX < 3) {
 179     num_xmm_regs = num_xmm_regs/2;
 180   }
 181 #if COMPILER2_OR_JVMCI
 182   if (save_vectors && UseAVX == 0) {
 183     save_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 184   }
 185   assert(!save_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 186 #else
 187   save_vectors = false; // vectors are generated only by C2 and JVMCI
 188 #endif
 189 
 190   // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
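       // num_xmm_regs is 16 or 32 at this point, so aligning to it keeps the frame size a multiple of 16 bytes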
 191   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 192   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 193   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 194   // CodeBlob frame size is in words.
 195   int frame_size_in_words = frame_size_in_bytes / wordSize;
 196   *total_frame_words = frame_size_in_words;
 197 
 198   // Save registers, fpu state, and flags.
 199   // We assume caller has already pushed the return address onto the
 200   // stack, so rsp is 8-byte aligned here.
 201   // We push rbp twice in this sequence because we want the real rbp
 202   // to be under the return like a normal enter.
 203 
 204   __ enter();          // rsp becomes 16-byte aligned here
 205   __ push_CPU_state(); // Push a multiple of 16 bytes
 206 
 207   // push_CPU_state handles this on EVEX enabled targets
 208   if (save_vectors) {
 209     // Save upper half of YMM registers(0..15)
 210     int base_addr = XSAVE_AREA_YMM_BEGIN;
 211     for (int n = 0; n < 16; n++) {
 212       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 213     }
 214     if (VM_Version::supports_evex()) {
 215       // Save upper half of ZMM registers(0..15)
 216       base_addr = XSAVE_AREA_ZMM_BEGIN;
 217       for (int n = 0; n < 16; n++) {
 218         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 219       }
 220       // Save full ZMM registers(16..num_xmm_regs)
 221       base_addr = XSAVE_AREA_UPPERBANK;
 222       off = 0;
 223       int vector_len = Assembler::AVX_512bit;
 224       for (int n = 16; n < num_xmm_regs; n++) {
 225         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 226       }
 227 #if COMPILER2_OR_JVMCI
 228       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 229       off = 0;
 230       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 231         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 232       }
 233 #endif
 234     }
 235   } else {
 236     if (VM_Version::supports_evex()) {
 237       // Save upper bank of ZMM registers(16..31) for double/float usage
 238       int base_addr = XSAVE_AREA_UPPERBANK;
 239       off = 0;
 240       for (int n = 16; n < num_xmm_regs; n++) {
 241         __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
 242       }
 243 #if COMPILER2_OR_JVMCI
 244       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 245       off = 0;
 246       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 247         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 248       }
 249 #endif
 250     }
 251   }
 252   __ vzeroupper();
 253   if (frame::arg_reg_save_area_bytes != 0) {
 254     // Allocate argument register save area
 255     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 256   }
 257 
 258   // Set an oopmap for the call site.  This oopmap will map all
 259   // oop-registers and debug-info registers as callee-saved.  This
 260   // will allow deoptimization at this safepoint to find all possible
 261   // debug-info recordings, as well as let GC find all oops.
 262 
 263   OopMapSet *oop_maps = new OopMapSet();
 264   OopMap* map = new OopMap(frame_size_in_slots, 0);
 265 
 266 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 267 
 268   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 271   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 272   // rbp location is known implicitly by the frame sender code, needs no oopmap
 273   // and the location where rbp was saved is ignored
 274   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 283   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 284   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 285   // on EVEX enabled targets, we get it included in the xsave area
 286   off = xmm0_off;
 287   int delta = xmm1_off - off;
 288   for (int n = 0; n < 16; n++) {
 289     XMMRegister xmm_name = as_XMMRegister(n);
 290     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 291     off += delta;
 292   }
 293   if (UseAVX > 2) {
 294     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 295     off = zmm16_off;
 296     delta = zmm17_off - off;
 297     for (int n = 16; n < num_xmm_regs; n++) {
 298       XMMRegister zmm_name = as_XMMRegister(n);
 299       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 300       off += delta;
 301     }
 302   }
 303 
 304 #if COMPILER2_OR_JVMCI
 305   if (save_vectors) {
 306     // Save upper half of YMM registers(0..15)
 307     off = ymm0_off;
 308     delta = ymm1_off - ymm0_off;
 309     for (int n = 0; n < 16; n++) {
 310       XMMRegister ymm_name = as_XMMRegister(n);
 311       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 312       off += delta;
 313     }
 314     if (VM_Version::supports_evex()) {
 315       // Save upper half of ZMM registers(0..15)
 316       off = zmm0_off;
 317       delta = zmm1_off - zmm0_off;
 318       for (int n = 0; n < 16; n++) {
 319         XMMRegister zmm_name = as_XMMRegister(n);
 320         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 321         off += delta;
 322       }
 323     }
 324   }
 325 #endif // COMPILER2_OR_JVMCI
 326 
 327   // %%% These should all be a waste but we'll keep things as they were for now
 328   if (true) {
 329     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 333     // rbp location is known implicitly by the frame sender code, needs no oopmap
 334     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 343     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 344     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 345     // on EVEX enabled targets, we get it included in the xsave area
 346     off = xmm0H_off;
 347     delta = xmm1H_off - off;
 348     for (int n = 0; n < 16; n++) {
 349       XMMRegister xmm_name = as_XMMRegister(n);
 350       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 351       off += delta;
 352     }
 353     if (UseAVX > 2) {
 354       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 355       off = zmm16H_off;
 356       delta = zmm17H_off - off;
 357       for (int n = 16; n < num_xmm_regs; n++) {
 358         XMMRegister zmm_name = as_XMMRegister(n);
 359         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 360         off += delta;
 361       }
 362     }
 363   }
 364 
 365   return map;
 366 }
 367 PRAGMA_DIAG_POP
 368 
 369 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
 370   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 371   if (UseAVX < 3) {
 372     num_xmm_regs = num_xmm_regs/2;
 373   }
 374   if (frame::arg_reg_save_area_bytes != 0) {
 375     // Pop arg register save area
 376     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 377   }
 378 
 379 #if COMPILER2_OR_JVMCI
 380   if (restore_vectors) {
 381     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 382     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 383   }
 384 #else
 385   assert(!restore_vectors, "vectors are generated only by C2");
 386 #endif
 387 
 388   __ vzeroupper();
 389 
 390   // On EVEX enabled targets everything is handled in pop fpu state
 391   if (restore_vectors) {
 392     // Restore upper half of YMM registers (0..15)
 393     int base_addr = XSAVE_AREA_YMM_BEGIN;
 394     for (int n = 0; n < 16; n++) {
 395       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 396     }
 397     if (VM_Version::supports_evex()) {
 398       // Restore upper half of ZMM registers (0..15)
 399       base_addr = XSAVE_AREA_ZMM_BEGIN;
 400       for (int n = 0; n < 16; n++) {
 401         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 402       }
 403       // Restore full ZMM registers(16..num_xmm_regs)
 404       base_addr = XSAVE_AREA_UPPERBANK;
 405       int vector_len = Assembler::AVX_512bit;
 406       int off = 0;
 407       for (int n = 16; n < num_xmm_regs; n++) {
 408         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 409       }
 410 #if COMPILER2_OR_JVMCI
 411       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 412       off = 0;
 413       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 414         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 415       }
 416 #endif
 417     }
 418   } else {
 419     if (VM_Version::supports_evex()) {
 420       // Restore upper bank of ZMM registers(16..31) for double/float usage
 421       int base_addr = XSAVE_AREA_UPPERBANK;
 422       int off = 0;
 423       for (int n = 16; n < num_xmm_regs; n++) {
 424         __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
 425       }
 426 #if COMPILER2_OR_JVMCI
 427       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 428       off = 0;
 429       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 430         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 431       }
 432 #endif
 433     }
 434   }
 435 
 436   // Recover CPU state
 437   __ pop_CPU_state();
 438   // Get the rbp described implicitly by the calling convention (no oopMap)
 439   __ pop(rbp);
 440 }
 441 
 442 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 443 
 444   // Just restore result register. Only used by deoptimization. By
 445   // now any callee save register that needs to be restored to a c2
 446   // caller of the deoptee has been extracted into the vframeArray
 447   // and will be stuffed into the c2i adapter we create for later
 448   // restoration so only result registers need to be restored here.
 449 
 450   // Restore fp result register
 451   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 452   // Restore integer result register
 453   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 454   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 455 
 456   // Pop all of the register save area off the stack except the return address
 457   __ addptr(rsp, return_offset_in_bytes());
 458 }
 459 
 460 // Is the vector's size (in bytes) bigger than the size saved by default?
 461 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 462 bool SharedRuntime::is_wide_vector(int size) {
 463   return size > 16;
 464 }
 465 
 466 // ---------------------------------------------------------------------------
 467 // Read the array of BasicTypes from a signature, and compute where the
 468 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 469 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 470 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 471 // as framesizes are fixed.
 472 // VMRegImpl::stack0 refers to the first slot 0(sp),
 473 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Registers
 474 // 0 up to RegisterImpl::number_of_registers-1 are the 64-bit
 475 // integer registers.
 476 
 477 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 478 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 479 // units regardless of build. Of course for i486 there is no 64-bit build.
 480 
 481 // The Java calling convention is a "shifted" version of the C ABI.
 482 // By skipping the first C ABI register we can call non-static jni methods
 483 // with small numbers of arguments without having to shuffle the arguments
 484 // at all. Since we control the java ABI we ought to at least get some
 485 // advantage out of it.
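     // (For example, on Linux-x86_64 j_rarg0 is rsi, i.e. c_rarg1, so for a non-static JNI call
     // the JNIEnv* can be passed in c_rarg0 without shuffling any of the Java register arguments.)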
 486 
 487 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 488                                            VMRegPair *regs,
 489                                            int total_args_passed) {
 490 
 491   // Create the mapping between argument positions and
 492   // registers.
 493   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 494     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 495   };
 496   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 497     j_farg0, j_farg1, j_farg2, j_farg3,
 498     j_farg4, j_farg5, j_farg6, j_farg7
 499   };
 500 
 501 
 502   uint int_args = 0;
 503   uint fp_args = 0;
 504   uint stk_args = 0; // inc by 2 each time
 505 
 506   for (int i = 0; i < total_args_passed; i++) {
 507     switch (sig_bt[i]) {
 508     case T_BOOLEAN:
 509     case T_CHAR:
 510     case T_BYTE:
 511     case T_SHORT:
 512     case T_INT:
 513       if (int_args < Argument::n_int_register_parameters_j) {
 514         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 515       } else {
 516         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 517         stk_args += 2;
 518       }
 519       break;
 520     case T_VOID:
 521       // halves of T_LONG or T_DOUBLE
 522       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 523       regs[i].set_bad();
 524       break;
 525     case T_LONG:
 526       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 527       // fall through
 528     case T_OBJECT:
 529     case T_ARRAY:
 530     case T_ADDRESS:
 531       if (int_args < Argument::n_int_register_parameters_j) {
 532         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 533       } else {
 534         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 535         stk_args += 2;
 536       }
 537       break;
 538     case T_FLOAT:
 539       if (fp_args < Argument::n_float_register_parameters_j) {
 540         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 541       } else {
 542         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 543         stk_args += 2;
 544       }
 545       break;
 546     case T_DOUBLE:
 547       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 548       if (fp_args < Argument::n_float_register_parameters_j) {
 549         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 550       } else {
 551         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 552         stk_args += 2;
 553       }
 554       break;
 555     default:
 556       ShouldNotReachHere();
 557       break;
 558     }
 559   }
 560 
 561   return align_up(stk_args, 2);
 562 }
 563 
 564 // Patch the callers callsite with entry to compiled code if it exists.
 565 static void patch_callers_callsite(MacroAssembler *masm) {
 566   Label L;
 567   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
 568   __ jcc(Assembler::equal, L);
 569 
 570   // Save the current stack pointer
 571   __ mov(r13, rsp);
 572   // Schedule the branch target address early.
 573   // Call into the VM to patch the caller, then jump to compiled callee
 574   // rax isn't live so capture return address while we easily can
 575   __ movptr(rax, Address(rsp, 0));
 576 
 577   // align stack so push_CPU_state doesn't fault
 578   __ andptr(rsp, -(StackAlignmentInBytes));
 579   __ push_CPU_state();
 580   __ vzeroupper();
 581   // VM needs caller's callsite
 582   // VM needs target method
 583   // This needs to be a long call since we will relocate this adapter to
 584   // the codeBuffer and it may not reach
 585 
 586   // Allocate argument register save area
 587   if (frame::arg_reg_save_area_bytes != 0) {
 588     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 589   }
 590   __ mov(c_rarg0, rbx);
 591   __ mov(c_rarg1, rax);
 592   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 593 
 594   // De-allocate argument register save area
 595   if (frame::arg_reg_save_area_bytes != 0) {
 596     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 597   }
 598 
 599   __ vzeroupper();
 600   __ pop_CPU_state();
 601   // restore sp
 602   __ mov(rsp, r13);
 603   __ bind(L);
 604 }
 605 
 606 
 607 static void gen_c2i_adapter(MacroAssembler *masm,
 608                             int total_args_passed,
 609                             int comp_args_on_stack,
 610                             const BasicType *sig_bt,
 611                             const VMRegPair *regs,
 612                             Label& skip_fixup) {
 613   // Before we get into the guts of the C2I adapter, see if we should be here
 614   // at all.  We've come from compiled code and are attempting to jump to the
 615   // interpreter, which means the caller made a static call to get here
 616   // (vcalls always get a compiled target if there is one).  Check for a
 617   // compiled target.  If there is one, we need to patch the caller's call.
 618   patch_callers_callsite(masm);
 619 
 620   __ bind(skip_fixup);
 621 
 622   // Since all args are passed on the stack, total_args_passed *
 623   // Interpreter::stackElementSize is the space we need. Plus one word because
 624   // we also account for the return address location since
 625   // we store it first rather than holding it in rax across all the shuffling
 626 
 627   int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
 628 
 629   // stack is aligned, keep it that way
 630   extraspace = align_up(extraspace, 2*wordSize);
 631 
 632   // Get return address
 633   __ pop(rax);
 634 
 635   // set senderSP value
 636   __ mov(r13, rsp);
 637 
 638   __ subptr(rsp, extraspace);
 639 
 640   // Store the return address in the expected location
 641   __ movptr(Address(rsp, 0), rax);
 642 
 643   // Now write the args into the outgoing interpreter space
 644   for (int i = 0; i < total_args_passed; i++) {
 645     if (sig_bt[i] == T_VOID) {
 646       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 647       continue;
 648     }
 649 
 650     // offset to start parameters
 651     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 652     int next_off = st_off - Interpreter::stackElementSize;
 653 
 654     // Say 4 args:
 655     // i   st_off
 656     // 0   32 T_LONG
 657     // 1   24 T_VOID
 658     // 2   16 T_OBJECT
 659     // 3    8 T_BOOL
 660     // -    0 return address
 661     //
 662     // However, to make things extra confusing: because we can fit a long/double in
 663     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 664     // leaves one slot empty and only stores to a single slot. In this case the
 665     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 666 
 667     VMReg r_1 = regs[i].first();
 668     VMReg r_2 = regs[i].second();
 669     if (!r_1->is_valid()) {
 670       assert(!r_2->is_valid(), "");
 671       continue;
 672     }
 673     if (r_1->is_stack()) {
 674       // memory to memory use rax
 675       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 676       if (!r_2->is_valid()) {
 677         // sign extend??
 678         __ movl(rax, Address(rsp, ld_off));
 679         __ movptr(Address(rsp, st_off), rax);
 680 
 681       } else {
 682 
 683         __ movq(rax, Address(rsp, ld_off));
 684 
 685         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 686         // T_DOUBLE and T_LONG use two slots in the interpreter
 687         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 688           // ld_off == LSW, ld_off+wordSize == MSW
 689           // st_off == MSW, next_off == LSW
 690           __ movq(Address(rsp, next_off), rax);
 691 #ifdef ASSERT
 692           // Overwrite the unused slot with known junk
 693           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 694           __ movptr(Address(rsp, st_off), rax);
 695 #endif /* ASSERT */
 696         } else {
 697           __ movq(Address(rsp, st_off), rax);
 698         }
 699       }
 700     } else if (r_1->is_Register()) {
 701       Register r = r_1->as_Register();
 702       if (!r_2->is_valid()) {
 703         // must be only an int (or smaller) so move only 32 bits to the slot
 704         // why not sign extend??
 705         __ movl(Address(rsp, st_off), r);
 706       } else {
 707         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 708         // T_DOUBLE and T_LONG use two slots in the interpreter
 709         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 710           // long/double in gpr
 711 #ifdef ASSERT
 712           // Overwrite the unused slot with known junk
 713           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 714           __ movptr(Address(rsp, st_off), rax);
 715 #endif /* ASSERT */
 716           __ movq(Address(rsp, next_off), r);
 717         } else {
 718           __ movptr(Address(rsp, st_off), r);
 719         }
 720       }
 721     } else {
 722       assert(r_1->is_XMMRegister(), "");
 723       if (!r_2->is_valid()) {
 724         // only a float, use just part of the slot
 725         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 726       } else {
 727 #ifdef ASSERT
 728         // Overwrite the unused slot with known junk
 729         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 730         __ movptr(Address(rsp, st_off), rax);
 731 #endif /* ASSERT */
 732         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 733       }
 734     }
 735   }
 736 
 737   // Schedule the branch target address early.
 738   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 739   __ jmp(rcx);
 740 }
 741 
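     // Jump to L_ok if pc_reg lies strictly inside (code_start, code_end); otherwise fall through.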
 742 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 743                         address code_start, address code_end,
 744                         Label& L_ok) {
 745   Label L_fail;
 746   __ lea(temp_reg, ExternalAddress(code_start));
 747   __ cmpptr(pc_reg, temp_reg);
 748   __ jcc(Assembler::belowEqual, L_fail);
 749   __ lea(temp_reg, ExternalAddress(code_end));
 750   __ cmpptr(pc_reg, temp_reg);
 751   __ jcc(Assembler::below, L_ok);
 752   __ bind(L_fail);
 753 }
 754 
 755 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 756                                     int total_args_passed,
 757                                     int comp_args_on_stack,
 758                                     const BasicType *sig_bt,
 759                                     const VMRegPair *regs) {
 760 
 761   // Note: r13 contains the senderSP on entry. We must preserve it since
 762   // we may do a i2c -> c2i transition if we lose a race where compiled
 763   // code goes non-entrant while we get args ready.
 764   // In addition we use r13 to locate all the interpreter args as
 765   // we must align the stack to 16 bytes on an i2c entry else we
 766   // lose alignment we expect in all compiled code and register
 767   // save code can segv when fxsave instructions find improperly
 768   // aligned stack pointer.
 769 
 770   // Adapters can be frameless because they do not require the caller
 771   // to perform additional cleanup work, such as correcting the stack pointer.
 772   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 773   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 774   // even if a callee has modified the stack pointer.
 775   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 776   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 777   // up via the senderSP register).
 778   // In other words, if *either* the caller or callee is interpreted, we can
 779   // get the stack pointer repaired after a call.
 780   // This is why c2i and i2c adapters cannot be indefinitely composed.
 781   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 782   // both caller and callee would be compiled methods, and neither would
 783   // clean up the stack pointer changes performed by the two adapters.
 784   // If this happens, control eventually transfers back to the compiled
 785   // caller, but with an uncorrected stack, causing delayed havoc.
 786 
 787   // Pick up the return address
 788   __ movptr(rax, Address(rsp, 0));
 789 
 790   if (VerifyAdapterCalls &&
 791       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
 792     // So, let's test for cascading c2i/i2c adapters right now.
 793     //  assert(Interpreter::contains($return_addr) ||
 794     //         StubRoutines::contains($return_addr),
 795     //         "i2c adapter must return to an interpreter frame");
 796     __ block_comment("verify_i2c { ");
 797     Label L_ok;
 798     if (Interpreter::code() != NULL)
 799       range_check(masm, rax, r11,
 800                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
 801                   L_ok);
 802     if (StubRoutines::code1() != NULL)
 803       range_check(masm, rax, r11,
 804                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
 805                   L_ok);
 806     if (StubRoutines::code2() != NULL)
 807       range_check(masm, rax, r11,
 808                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
 809                   L_ok);
 810     const char* msg = "i2c adapter must return to an interpreter frame";
 811     __ block_comment(msg);
 812     __ stop(msg);
 813     __ bind(L_ok);
 814     __ block_comment("} verify_i2c ");
 815   }
 816 
 817   // Must preserve original SP for loading incoming arguments because
 818   // we need to align the outgoing SP for compiled code.
 819   __ movptr(r11, rsp);
 820 
 821   // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
 822   // in registers, we will occasionally have no stack args.
 823   int comp_words_on_stack = 0;
 824   if (comp_args_on_stack) {
 825     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
 826     // registers are below.  By subtracting stack0, we either get a negative
 827     // number (all values in registers) or the maximum stack slot accessed.
 828 
 829     // Convert 4-byte c2 stack slots to words.
 830     comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 831     // Round up to minimum stack alignment, in wordSize units
 832     comp_words_on_stack = align_up(comp_words_on_stack, 2);
 833     __ subptr(rsp, comp_words_on_stack * wordSize);
 834   }
 835 
 836 
 837   // Ensure compiled code always sees stack at proper alignment
 838   __ andptr(rsp, -16);
 839 
 840   // push the return address and misalign the stack the way the youngest frame always sees it,
 841   // as far as the placement of the call instruction is concerned
 842   __ push(rax);
 843 
 844   // Put saved SP in another register
 845   const Register saved_sp = rax;
 846   __ movptr(saved_sp, r11);
 847 
 848   // Will jump to the compiled code just as if compiled code was doing it.
 849   // Pre-load the register-jump target early, to schedule it better.
 850   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 851 
 852 #if INCLUDE_JVMCI
 853   if (EnableJVMCI) {
 854     // check if this call should be routed towards a specific entry point
 855     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 856     Label no_alternative_target;
 857     __ jcc(Assembler::equal, no_alternative_target);
 858     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 859     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 860     __ bind(no_alternative_target);
 861   }
 862 #endif // INCLUDE_JVMCI
 863 
 864   // Now generate the shuffle code.  Pick up all register args and move the
 865   // rest through the floating point stack top.
 866   for (int i = 0; i < total_args_passed; i++) {
 867     if (sig_bt[i] == T_VOID) {
 868       // Longs and doubles are passed in native word order, but misaligned
 869       // in the 32-bit build.
 870       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 871       continue;
 872     }
 873 
 874     // Pick up 0, 1 or 2 words from SP+offset.
 875 
 876     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 877             "scrambled load targets?");
 878     // Load in argument order going down.
 879     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 880     // Point to interpreter value (vs. tag)
 881     int next_off = ld_off - Interpreter::stackElementSize;
 882     //
 883     //
 884     //
 885     VMReg r_1 = regs[i].first();
 886     VMReg r_2 = regs[i].second();
 887     if (!r_1->is_valid()) {
 888       assert(!r_2->is_valid(), "");
 889       continue;
 890     }
 891     if (r_1->is_stack()) {
 892       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 893       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 894 
 895       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 896       // and if we end up going through a c2i because of a miss, a reasonable value of r13
 897       // will be generated.
 898       if (!r_2->is_valid()) {
 899         // sign extend???
 900         __ movl(r13, Address(saved_sp, ld_off));
 901         __ movptr(Address(rsp, st_off), r13);
 902       } else {
 903         //
 904         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 905         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
 906         // So we must adjust where to pick up the data to match the interpreter.
 907         //
 908         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
 909         // are accessed as negative so LSW is at LOW address
 910 
 911         // ld_off is MSW so get LSW
 912         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 913                            next_off : ld_off;
 914         __ movq(r13, Address(saved_sp, offset));
 915         // st_off is LSW (i.e. reg.first())
 916         __ movq(Address(rsp, st_off), r13);
 917       }
 918     } else if (r_1->is_Register()) {  // Register argument
 919       Register r = r_1->as_Register();
 920       assert(r != rax, "must be different");
 921       if (r_2->is_valid()) {
 922         //
 923         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 924         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
 925         // So we must adjust where to pick up the data to match the interpreter.
 926 
 927         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 928                            next_off : ld_off;
 929 
 930         // this can be a misaligned move
 931         __ movq(r, Address(saved_sp, offset));
 932       } else {
 933         // sign extend and use a full word?
 934         __ movl(r, Address(saved_sp, ld_off));
 935       }
 936     } else {
 937       if (!r_2->is_valid()) {
 938         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 939       } else {
 940         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 941       }
 942     }
 943   }
 944   }
 945   // 6243940 We might end up in handle_wrong_method if
 946   // the callee is deoptimized as we race thru here. If that
 947   // happens we don't want to take a safepoint because the
 948   // caller frame will look interpreted and arguments are now
 949   // "compiled" so it is much better to make this transition
 950   // invisible to the stack walking code. Unfortunately if
 951   // we try and find the callee by normal means a safepoint
 952   // is possible. So we stash the desired callee in the thread
 953   // and the VM will find it there should this case occur.
 954 
 955   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 956 
 957   // put Method* where a c2i would expect it, should we end up there
 958   // only needed because c2 resolve stubs return Method* as a result in
 959   // rax
 960   __ mov(rax, rbx);
 961   __ jmp(r11);
 962 }
 963 
 964 // ---------------------------------------------------------------
 965 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 966                                                             int total_args_passed,
 967                                                             int comp_args_on_stack,
 968                                                             const BasicType *sig_bt,
 969                                                             const VMRegPair *regs,
 970                                                             AdapterFingerPrint* fingerprint) {
 971   address i2c_entry = __ pc();
 972 
 973   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 974 
 975   // -------------------------------------------------------------------------
 976   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 977   // to the interpreter.  The args start out packed in the compiled layout.  They
 978   // need to be unpacked into the interpreter layout.  This will almost always
 979   // require some stack space.  We grow the current (compiled) stack, then repack
 980   // the args.  We  finally end in a jump to the generic interpreter entry point.
 981   // On exit from the interpreter, the interpreter will restore our SP (lest the
 982   // compiled code, which relies solely on SP and not RBP, get sick).
 983 
 984   address c2i_unverified_entry = __ pc();
 985   Label skip_fixup;
 986   Label ok;
 987 
 988   Register holder = rax;
 989   Register receiver = j_rarg0;
 990   Register temp = rbx;
 991 
 992   {
 993     __ load_klass(temp, receiver, rscratch1);
 994     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
 995     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
 996     __ jcc(Assembler::equal, ok);
 997     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 998 
 999     __ bind(ok);
1000     // Method might have been compiled since the call site was patched to
1001     // interpreted; if that is the case, treat it as a miss so we can get
1002     // the call site corrected.
1003     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
1004     __ jcc(Assembler::equal, skip_fixup);
1005     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1006   }
1007 
1008   address c2i_entry = __ pc();
1009 
1010   // Class initialization barrier for static methods
1011   address c2i_no_clinit_check_entry = NULL;
1012   if (VM_Version::supports_fast_class_init_checks()) {
1013     Label L_skip_barrier;
1014     Register method = rbx;
1015 
1016     { // Bypass the barrier for non-static methods
1017       Register flags  = rscratch1;
1018       __ movl(flags, Address(method, Method::access_flags_offset()));
1019       __ testl(flags, JVM_ACC_STATIC);
1020       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1021     }
1022 
1023     Register klass = rscratch1;
1024     __ load_method_holder(klass, method);
1025     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1026 
1027     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1028 
1029     __ bind(L_skip_barrier);
1030     c2i_no_clinit_check_entry = __ pc();
1031   }
1032 
1033   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1034   bs->c2i_entry_barrier(masm);
1035 
1036   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1037 
1038   __ flush();
1039   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1040 }
1041 
1042 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1043                                          VMRegPair *regs,
1044                                          VMRegPair *regs2,
1045                                          int total_args_passed) {
1046   assert(regs2 == NULL, "not needed on x86");
1047 // We return the number of VMRegImpl stack slots we need to reserve for all
1048 // the arguments NOT counting out_preserve_stack_slots.
1049 
1050 // NOTE: These arrays will have to change when c1 is ported
1051 #ifdef _WIN64
1052     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1053       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1054     };
1055     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1056       c_farg0, c_farg1, c_farg2, c_farg3
1057     };
1058 #else
1059     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1060       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1061     };
1062     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1063       c_farg0, c_farg1, c_farg2, c_farg3,
1064       c_farg4, c_farg5, c_farg6, c_farg7
1065     };
1066 #endif // _WIN64
1067 
1068 
1069     uint int_args = 0;
1070     uint fp_args = 0;
1071     uint stk_args = 0; // inc by 2 each time
1072 
1073     for (int i = 0; i < total_args_passed; i++) {
1074       switch (sig_bt[i]) {
1075       case T_BOOLEAN:
1076       case T_CHAR:
1077       case T_BYTE:
1078       case T_SHORT:
1079       case T_INT:
1080         if (int_args < Argument::n_int_register_parameters_c) {
1081           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1082 #ifdef _WIN64
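               // On Windows, integer and FP argument registers share position indices, so
               // consuming an int register here also consumes the corresponding FP slot.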
1083           fp_args++;
1084           // Allocate slots for the callee to stuff register args on the stack.
1085           stk_args += 2;
1086 #endif
1087         } else {
1088           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1089           stk_args += 2;
1090         }
1091         break;
1092       case T_LONG:
1093         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1094         // fall through
1095       case T_OBJECT:
1096       case T_ARRAY:
1097       case T_ADDRESS:
1098       case T_METADATA:
1099         if (int_args < Argument::n_int_register_parameters_c) {
1100           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1101 #ifdef _WIN64
1102           fp_args++;
1103           stk_args += 2;
1104 #endif
1105         } else {
1106           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1107           stk_args += 2;
1108         }
1109         break;
1110       case T_FLOAT:
1111         if (fp_args < Argument::n_float_register_parameters_c) {
1112           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1113 #ifdef _WIN64
1114           int_args++;
1115           // Allocate slots for the callee to stuff register args on the stack.
1116           stk_args += 2;
1117 #endif
1118         } else {
1119           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1120           stk_args += 2;
1121         }
1122         break;
1123       case T_DOUBLE:
1124         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1125         if (fp_args < Argument::n_float_register_parameters_c) {
1126           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1127 #ifdef _WIN64
1128           int_args++;
1129           // Allocate slots for the callee to stuff register args on the stack.
1130           stk_args += 2;
1131 #endif
1132         } else {
1133           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1134           stk_args += 2;
1135         }
1136         break;
1137       case T_VOID: // Halves of longs and doubles
1138         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1139         regs[i].set_bad();
1140         break;
1141       default:
1142         ShouldNotReachHere();
1143         break;
1144       }
1145     }
1146 #ifdef _WIN64
1147   // The Windows ABI requires that we always allocate enough stack space
1148   // for four 64-bit registers to be stored down.
1149   if (stk_args < 8) {
1150     stk_args = 8;
1151   }
1152 #endif // _WIN64
1153 
1154   return stk_args;
1155 }
1156 
1157 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1158                                              uint num_bits,
1159                                              uint total_args_passed) {
1160   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1161          "only certain vector sizes are supported for now");
1162 
1163   static const XMMRegister VEC_ArgReg[32] = {
1164      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1165      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1166     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1167     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1168   };
1169 
1170   uint stk_args = 0;
1171   uint fp_args = 0;
1172 
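       // Vector arguments are passed entirely in xmm/ymm/zmm registers (stk_args stays 0);
       // each value spans num_bits/32 consecutive 32-bit VMReg slots, recorded as a pair of
       // its first and last slot.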
1173   for (uint i = 0; i < total_args_passed; i++) {
1174     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1175     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1176     regs[i].set_pair(vmreg->next(next_val), vmreg);
1177   }
1178 
1179   return stk_args;
1180 }
1181 
1182 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1183   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1184   // which by this time is free to use
1185   switch (ret_type) {
1186   case T_FLOAT:
1187     __ movflt(Address(rbp, -wordSize), xmm0);
1188     break;
1189   case T_DOUBLE:
1190     __ movdbl(Address(rbp, -wordSize), xmm0);
1191     break;
1192   case T_VOID:  break;
1193   default: {
1194     __ movptr(Address(rbp, -wordSize), rax);
1195     }
1196   }
1197 }
1198 
1199 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1200   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1201   // which by this time is free to use
1202   switch (ret_type) {
1203   case T_FLOAT:
1204     __ movflt(xmm0, Address(rbp, -wordSize));
1205     break;
1206   case T_DOUBLE:
1207     __ movdbl(xmm0, Address(rbp, -wordSize));
1208     break;
1209   case T_VOID:  break;
1210   default: {
1211     __ movptr(rax, Address(rbp, -wordSize));
1212     }
1213   }
1214 }
1215 
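     // Spill the outgoing argument registers (GPRs and XMMs) to the stack around a VM call;
     // restore_args below reloads them in the reverse order.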
1216 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1217     for ( int i = first_arg ; i < arg_count ; i++ ) {
1218       if (args[i].first()->is_Register()) {
1219         __ push(args[i].first()->as_Register());
1220       } else if (args[i].first()->is_XMMRegister()) {
1221         __ subptr(rsp, 2*wordSize);
1222         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1223       }
1224     }
1225 }
1226 
1227 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1228     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1229       if (args[i].first()->is_Register()) {
1230         __ pop(args[i].first()->as_Register());
1231       } else if (args[i].first()->is_XMMRegister()) {
1232         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1233         __ addptr(rsp, 2*wordSize);
1234       }
1235     }
1236 }
1237 
1238 // Different signatures may require very different orders for the move
1239 // to avoid clobbering other arguments.  There's no simple way to
1240 // order them safely.  Compute a safe order for issuing stores and
1241 // break any cycles in those stores.  This code is fairly general but
1242 // it's not necessary on the other platforms so we keep it in the
1243 // platform dependent code instead of moving it into a shared file.
1244 // (See bugs 7013347 & 7145024.)
1245 // Note that this code is specific to LP64.
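     // For example, the pair of moves {rdi -> rsi, rsi -> rdi} forms a cycle; it is broken by
     // routing one of the values through tmp_vmreg and performing that store last.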
1246 class ComputeMoveOrder: public StackObj {
1247   class MoveOperation: public ResourceObj {
1248     friend class ComputeMoveOrder;
1249    private:
1250     VMRegPair        _src;
1251     VMRegPair        _dst;
1252     int              _src_index;
1253     int              _dst_index;
1254     bool             _processed;
1255     MoveOperation*  _next;
1256     MoveOperation*  _prev;
1257 
1258     static int get_id(VMRegPair r) {
1259       return r.first()->value();
1260     }
1261 
1262    public:
1263     MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1264       _src(src)
1265     , _dst(dst)
1266     , _src_index(src_index)
1267     , _dst_index(dst_index)
1268     , _processed(false)
1269     , _next(NULL)
1270     , _prev(NULL) {
1271     }
1272 
1273     VMRegPair src() const              { return _src; }
1274     int src_id() const                 { return get_id(src()); }
1275     int src_index() const              { return _src_index; }
1276     VMRegPair dst() const              { return _dst; }
1277     void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
1278     int dst_index() const              { return _dst_index; }
1279     int dst_id() const                 { return get_id(dst()); }
1280     MoveOperation* next() const       { return _next; }
1281     MoveOperation* prev() const       { return _prev; }
1282     void set_processed()               { _processed = true; }
1283     bool is_processed() const          { return _processed; }
1284 
1285     // Break a cycle: route this value through temp_register and store it to the original destination last.
1286     void break_cycle(VMRegPair temp_register) {
1287       // create a new store following the last store
1288       // to move from the temp_register to the original
1289       MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1290 
1291       // break the cycle of links and insert new_store at the end
1292       // break the reverse link.
1293       MoveOperation* p = prev();
1294       assert(p->next() == this, "must be");
1295       _prev = NULL;
1296       p->_next = new_store;
1297       new_store->_prev = p;
1298 
1299       // change the original store to save its value in the temp.
1300       set_dst(-1, temp_register);
1301     }
1302 
1303     void link(GrowableArray<MoveOperation*>& killer) {
1304       // link this store in front of the store that it depends on
1305       MoveOperation* n = killer.at_grow(src_id(), NULL);
1306       if (n != NULL) {
1307         assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1308         _next = n;
1309         n->_prev = this;
1310       }
1311     }
1312   };
1313 
1314  private:
1315   GrowableArray<MoveOperation*> edges;
1316 
1317  public:
1318   ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1319                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1320     // Move operations where the dest is the stack can all be
1321     // scheduled first since they can't interfere with the other moves.
1322     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1323       if (in_sig_bt[i] == T_ARRAY) {
1324         c_arg--;
1325         if (out_regs[c_arg].first()->is_stack() &&
1326             out_regs[c_arg + 1].first()->is_stack()) {
1327           arg_order.push(i);
1328           arg_order.push(c_arg);
1329         } else {
1330           if (out_regs[c_arg].first()->is_stack() ||
1331               in_regs[i].first() == out_regs[c_arg].first()) {
1332             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1333           } else {
1334             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1335           }
1336         }
1337       } else if (in_sig_bt[i] == T_VOID) {
1338         arg_order.push(i);
1339         arg_order.push(c_arg);
1340       } else {
1341         if (out_regs[c_arg].first()->is_stack() ||
1342             in_regs[i].first() == out_regs[c_arg].first()) {
1343           arg_order.push(i);
1344           arg_order.push(c_arg);
1345         } else {
1346           add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1347         }
1348       }
1349     }
1350     // Break any cycles in the register moves and emit them in the
1351     // proper order.
1352     GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1353     for (int i = 0; i < stores->length(); i++) {
1354       arg_order.push(stores->at(i)->src_index());
1355       arg_order.push(stores->at(i)->dst_index());
1356     }
1357  }
1358 
1359   // Collect all the move operations
1360   void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1361     if (src.first() == dst.first()) return;
1362     edges.append(new MoveOperation(src_index, src, dst_index, dst));
1363   }
1364 
1365   // Walk the edges breaking cycles between moves.  The result list
1366   // can be walked in order to produce the proper set of loads
1367   GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1368     // Record which moves kill which values
1369     GrowableArray<MoveOperation*> killer;
1370     for (int i = 0; i < edges.length(); i++) {
1371       MoveOperation* s = edges.at(i);
1372       assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1373       killer.at_put_grow(s->dst_id(), s, NULL);
1374     }
1375     assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1376            "make sure temp isn't in the registers that are killed");
1377 
1378     // create links between loads and stores
1379     for (int i = 0; i < edges.length(); i++) {
1380       edges.at(i)->link(killer);
1381     }
1382 
1383     // at this point, all the move operations are chained together
1384     // in a doubly linked list.  Processing it backwards finds
1385     // the beginning of the chain, forwards finds the end.  If there's
1386   // a cycle it can be broken at any point, so pick an edge and walk
1387     // backward until the list ends or we end where we started.
1388     GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1389     for (int e = 0; e < edges.length(); e++) {
1390       MoveOperation* s = edges.at(e);
1391       if (!s->is_processed()) {
1392         MoveOperation* start = s;
1393         // search for the beginning of the chain or cycle
1394         while (start->prev() != NULL && start->prev() != s) {
1395           start = start->prev();
1396         }
1397         if (start->prev() == s) {
1398           start->break_cycle(temp_register);
1399         }
1400         // walk the chain forward inserting to store list
1401         while (start != NULL) {
1402           stores->append(start);
1403           start->set_processed();
1404           start = start->next();
1405         }
1406       }
1407     }
1408     return stores;
1409   }
1410 };
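// Note: the native wrapper below shuffles its arguments by simply walking them backwards
// (see "The Grand Shuffle"), so ComputeMoveOrder appears to be unused in this file; it is
// retained here with the platform-dependent code per the comment above.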
1411 
1412 static void verify_oop_args(MacroAssembler* masm,
1413                             const methodHandle& method,
1414                             const BasicType* sig_bt,
1415                             const VMRegPair* regs) {
1416   Register temp_reg = rbx;  // not part of any compiled calling seq
1417   if (VerifyOops) {
1418     for (int i = 0; i < method->size_of_parameters(); i++) {
1419       if (is_reference_type(sig_bt[i])) {
1420         VMReg r = regs[i].first();
1421         assert(r->is_valid(), "bad oop arg");
1422         if (r->is_stack()) {
1423           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1424           __ verify_oop(temp_reg);
1425         } else {
1426           __ verify_oop(r->as_Register());
1427         }
1428       }
1429     }
1430   }
1431 }
1432 
1433 static void gen_special_dispatch(MacroAssembler* masm,
1434                                  const methodHandle& method,
1435                                  const BasicType* sig_bt,
1436                                  const VMRegPair* regs) {
1437   verify_oop_args(masm, method, sig_bt, regs);
1438   vmIntrinsics::ID iid = method->intrinsic_id();
1439 
1440   // Now write the args into the outgoing interpreter space
1441   bool     has_receiver   = false;
1442   Register receiver_reg   = noreg;
1443   int      member_arg_pos = -1;
1444   Register member_reg     = noreg;
1445   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1446   if (ref_kind != 0) {
1447     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1448     member_reg = rbx;  // known to be free at this point
1449     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1450   } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1451     has_receiver = true;
1452   } else {
1453     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1454   }
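  // (A non-zero ref_kind identifies the linkTo* intrinsics, which carry a trailing
  // MemberName naming the actual target; invokeBasic and linkToNative instead dispatch
  // through their first argument, which is treated as the receiver here.)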
1455 
1456   if (member_reg != noreg) {
1457     // Load the member_arg into register, if necessary.
1458     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1459     VMReg r = regs[member_arg_pos].first();
1460     if (r->is_stack()) {
1461       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1462     } else {
1463       // no data motion is needed
1464       member_reg = r->as_Register();
1465     }
1466   }
1467 
1468   if (has_receiver) {
1469     // Make sure the receiver is loaded into a register.
1470     assert(method->size_of_parameters() > 0, "oob");
1471     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1472     VMReg r = regs[0].first();
1473     assert(r->is_valid(), "bad receiver arg");
1474     if (r->is_stack()) {
1475       // Porting note:  This assumes that compiled calling conventions always
1476       // pass the receiver oop in a register.  If this is not true on some
1477       // platform, pick a temp and load the receiver from stack.
1478       fatal("receiver always in a register");
1479       receiver_reg = j_rarg0;  // known to be free at this point
1480       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1481     } else {
1482       // no data motion is needed
1483       receiver_reg = r->as_Register();
1484     }
1485   }
1486 
1487   // Figure out which address we are really jumping to:
1488   MethodHandles::generate_method_handle_dispatch(masm, iid,
1489                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1490 }
1491 
1492 // ---------------------------------------------------------------------------
1493 // Generate a native wrapper for a given method.  The method takes arguments
1494 // in the Java compiled code convention, marshals them to the native
1495 // convention (handlizes oops, etc), transitions to native, makes the call,
1496 // returns to java state (possibly blocking), unhandlizes any result and
1497 // returns.
1498 //
1499 // Critical native functions are a shorthand for the use of
1500 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1501 // functions.  The wrapper is expected to unpack the arguments before
1502 // passing them to the callee. Critical native functions leave the state _in_Java,
1503 // since they cannot stop for GC.
1504 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1505 // block and the check for pending exceptions, since it's impossible for them
1506 // to be thrown.
1507 //
1508 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1509                                                 const methodHandle& method,
1510                                                 int compile_id,
1511                                                 BasicType* in_sig_bt,
1512                                                 VMRegPair* in_regs,
1513                                                 BasicType ret_type) {
1514   if (method->is_method_handle_intrinsic()) {
1515     vmIntrinsics::ID iid = method->intrinsic_id();
1516     intptr_t start = (intptr_t)__ pc();
1517     int vep_offset = ((intptr_t)__ pc()) - start;
1518     gen_special_dispatch(masm,
1519                          method,
1520                          in_sig_bt,
1521                          in_regs);
1522     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1523     __ flush();
1524     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1525     return nmethod::new_native_nmethod(method,
1526                                        compile_id,
1527                                        masm->code(),
1528                                        vep_offset,
1529                                        frame_complete,
1530                                        stack_slots / VMRegImpl::slots_per_word,
1531                                        in_ByteSize(-1),
1532                                        in_ByteSize(-1),
1533                                        (OopMapSet*)NULL);
1534   }
1535   address native_func = method->native_function();
1536   assert(native_func != NULL, "must have function");
1537 
1538   // An OopMap for lock (and class if static)
1539   OopMapSet *oop_maps = new OopMapSet();
1540   intptr_t start = (intptr_t)__ pc();
1541 
1542   // We have received a description of where all the java args are located
1543   // on entry to the wrapper. We need to convert these args to where
1544   // the jni function will expect them. To figure out where they go
1545   // we convert the java signature to a C signature by inserting
1546   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1547 
1548   const int total_in_args = method->size_of_parameters();
1549   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1550 
1551   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1552   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1553   BasicType* in_elem_bt = NULL;
1554 
1555   int argc = 0;
1556   out_sig_bt[argc++] = T_ADDRESS;
1557   if (method->is_static()) {
1558     out_sig_bt[argc++] = T_OBJECT;
1559   }
1560 
1561   for (int i = 0; i < total_in_args ; i++ ) {
1562     out_sig_bt[argc++] = in_sig_bt[i];
1563   }
1564 
1565   // Now figure out where the args must be stored and how much stack space
1566   // they require.
1567   int out_arg_slots;
1568   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1569 
1570   // Compute framesize for the wrapper.  We need to handlize all oops in
1571   // incoming registers
1572 
1573   // Calculate the total number of stack slots we will need.
1574 
1575   // First count the abi requirement plus all of the outgoing args
1576   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1577 
1578   // Now the space for the inbound oop handle area
1579   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
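  // Oop arguments that arrive in registers are spilled into this area so that a handle
  // (the address of the spill slot) can be passed to the native code and the oopMap can
  // describe the oop across the call.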
1580 
1581   int oop_handle_offset = stack_slots;
1582   stack_slots += total_save_slots;
1583 
1584   // Now any space we need for handlizing a klass if static method
1585 
1586   int klass_slot_offset = 0;
1587   int klass_offset = -1;
1588   int lock_slot_offset = 0;
1589   bool is_static = false;
1590 
1591   if (method->is_static()) {
1592     klass_slot_offset = stack_slots;
1593     stack_slots += VMRegImpl::slots_per_word;
1594     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1595     is_static = true;
1596   }
1597 
1598   // Plus a lock if needed
1599 
1600   if (method->is_synchronized()) {
1601     lock_slot_offset = stack_slots;
1602     stack_slots += VMRegImpl::slots_per_word;
1603   }
1604 
1605   // Now a place (+2) to save return values or temp during shuffling
1606   // + 4 for return address (which we own) and saved rbp
1607   stack_slots += 6;
1608 
1609   // Ok The space we have allocated will look like:
1610   //
1611   //
1612   // FP-> |                     |
1613   //      |---------------------|
1614   //      | 2 slots for moves   |
1615   //      |---------------------|
1616   //      | lock box (if sync)  |
1617   //      |---------------------| <- lock_slot_offset
1618   //      | klass (if static)   |
1619   //      |---------------------| <- klass_slot_offset
1620   //      | oopHandle area      |
1621   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1622   //      | outbound memory     |
1623   //      | based arguments     |
1624   //      |                     |
1625   //      |---------------------|
1626   //      |                     |
1627   // SP-> | out_preserved_slots |
1628   //
1629   //
1630 
1631 
1632   // Now compute actual number of stack words we need rounding to make
1633   // stack properly aligned.
1634   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1635 
1636   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1637 
1638   // First thing make an ic check to see if we should even be here
1639 
1640   // We are free to use all registers as temps without saving them and
1641   // restoring them except rbp. rbp is the only callee save register
1642   // as far as the interpreter and the compiler(s) are concerned.
1643 
1644 
1645   const Register ic_reg = rax;
1646   const Register receiver = j_rarg0;
1647 
1648   Label hit;
1649   Label exception_pending;
1650 
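  // Inline cache check: the expected klass arrives in rax (ic_reg); if it does not match
  // the receiver's actual klass, jump to the IC miss stub.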
1651   assert_different_registers(ic_reg, receiver, rscratch1);
1652   __ verify_oop(receiver);
1653   __ load_klass(rscratch1, receiver, rscratch2);
1654   __ cmpq(ic_reg, rscratch1);
1655   __ jcc(Assembler::equal, hit);
1656 
1657   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1658 
1659   // Verified entry point must be aligned
1660   __ align(8);
1661 
1662   __ bind(hit);
1663 
1664   int vep_offset = ((intptr_t)__ pc()) - start;
1665 
1666   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1667     Label L_skip_barrier;
1668     Register klass = r10;
1669     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1670     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1671 
1672     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1673 
1674     __ bind(L_skip_barrier);
1675   }
1676 
1677 #ifdef COMPILER1
1678   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1679   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1680     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1681   }
1682 #endif // COMPILER1
1683 
1684   // The instruction at the verified entry point must be 5 bytes or longer
1685   // because it can be patched on the fly by make_non_entrant. The stack bang
1686   // instruction fits that requirement.
1687 
1688   // Generate stack overflow check
1689   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1690 
1691   // Generate a new frame for the wrapper.
1692   __ enter();
1693   // -2 because return address is already present and so is saved rbp
1694   __ subptr(rsp, stack_size - 2*wordSize);
1695 
1696   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1697   bs->nmethod_entry_barrier(masm);
1698 
1699   // Frame is now completed as far as size and linkage.
1700   int frame_complete = ((intptr_t)__ pc()) - start;
1701 
1702   if (UseRTMLocking) {
1703     // Abort RTM transaction before calling JNI
1704     // because critical section will be large and will be
1705     // aborted anyway. Also nmethod could be deoptimized.
1706     __ xabort(0);
1707   }
1708 
1709 #ifdef ASSERT
1710   {
1711     Label L;
1712     __ mov(rax, rsp);
1713     __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
1714     __ cmpptr(rax, rsp);
1715     __ jcc(Assembler::equal, L);
1716     __ stop("improperly aligned stack");
1717     __ bind(L);
1718   }
1719 #endif /* ASSERT */
1720 
1721 
1722   // We use r14 as the oop handle for the receiver/klass
1723   // It is callee save so it survives the call to native
1724 
1725   const Register oop_handle_reg = r14;
1726 
1727   //
1728   // We immediately shuffle the arguments so that any vm call we have to
1729   // make from here on out (sync slow path, jvmti, etc.) we will have
1730   // captured the oops from our caller and have a valid oopMap for
1731   // them.
1732 
1733   // -----------------
1734   // The Grand Shuffle
1735 
1736   // The Java calling convention is either equal (linux) or denser (win64) than the
1737   // c calling convention. However, because of the jni_env argument the c calling
1738   // convention always has at least one more (and two for static) arguments than Java.
1739   // Therefore if we move the args from java -> c backwards then we will never have
1740   // a register->register conflict and we don't have to build a dependency graph
1741   // and figure out how to break any cycles.
1742   //
1743 
1744   // Record esp-based slot for receiver on stack for non-static methods
1745   int receiver_offset = -1;
1746 
1747   // This is a trick. We double the stack slots so we can claim
1748   // the oops in the caller's frame. Since we are sure to have
1749   // more args than the caller, doubling is enough to make
1750   // sure we can capture all the incoming oop args from the
1751   // caller.
1752   //
1753   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1754 
1755   // Mark location of rbp (someday)
1756   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1757 
1758   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1759   // All inbound args are referenced based on rbp and all outbound args via rsp.
1760 
1761 
1762 #ifdef ASSERT
1763   bool reg_destroyed[RegisterImpl::number_of_registers];
1764   bool freg_destroyed[XMMRegisterImpl::number_of_registers];
1765   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
1766     reg_destroyed[r] = false;
1767   }
1768   for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
1769     freg_destroyed[f] = false;
1770   }
1771 
1772 #endif /* ASSERT */
1773 
1774   // For JNI natives the incoming and outgoing registers are offset upwards.
1775   GrowableArray<int> arg_order(2 * total_in_args);
1776 
1777   VMRegPair tmp_vmreg;
1778   tmp_vmreg.set2(rbx->as_VMReg());
1779 
1780   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1781     arg_order.push(i);
1782     arg_order.push(c_arg);
1783   }
1784 
1785   int temploc = -1;
1786   for (int ai = 0; ai < arg_order.length(); ai += 2) {
1787     int i = arg_order.at(ai);
1788     int c_arg = arg_order.at(ai + 1);
1789     __ block_comment(err_msg("move %d -> %d", i, c_arg));
1790 #ifdef ASSERT
1791     if (in_regs[i].first()->is_Register()) {
1792       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1793     } else if (in_regs[i].first()->is_XMMRegister()) {
1794       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1795     }
1796     if (out_regs[c_arg].first()->is_Register()) {
1797       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1798     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1799       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1800     }
1801 #endif /* ASSERT */
1802     switch (in_sig_bt[i]) {
1803       case T_ARRAY:
1804       case T_OBJECT:
1805         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1806                     ((i == 0) && (!is_static)),
1807                     &receiver_offset);
1808         break;
1809       case T_VOID:
1810         break;
1811 
1812       case T_FLOAT:
1813         __ float_move(in_regs[i], out_regs[c_arg]);
1814         break;
1815 
1816       case T_DOUBLE:
1817         assert( i + 1 < total_in_args &&
1818                 in_sig_bt[i + 1] == T_VOID &&
1819                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1820         __ double_move(in_regs[i], out_regs[c_arg]);
1821         break;
1822 
1823       case T_LONG :
1824         __ long_move(in_regs[i], out_regs[c_arg]);
1825         break;
1826 
1827       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
1828 
1829       default:
1830         __ move32_64(in_regs[i], out_regs[c_arg]);
1831     }
1832   }
1833 
1834   int c_arg;
1835 
1836   // Pre-load a static method's oop into r14.  Used both by locking code and
1837   // the normal JNI call code.
1838   // point c_arg at the first arg that is already loaded in case we
1839   // need to spill before we call out
1840   c_arg = total_c_args - total_in_args;
1841 
1842   if (method->is_static()) {
1843 
1844     //  load oop into a register
1845     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1846 
1847     // Now handlize the static class mirror; it's known to be not-null.
1848     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1849     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1850 
1851     // Now get the handle
1852     __ lea(oop_handle_reg, Address(rsp, klass_offset));
1853     // store the klass handle as second argument
1854     __ movptr(c_rarg1, oop_handle_reg);
1855     // and protect the arg if we must spill
1856     c_arg--;
1857   }
1858 
1859   // Change state to native (we save the return address in the thread, since it might not
1860   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
1861   // points into the right code segment. It does not have to be the correct return pc.
1862   // We use the same pc/oopMap repeatedly when we call out
1863 
1864   intptr_t the_pc = (intptr_t) __ pc();
1865   oop_maps->add_gc_map(the_pc - start, map);
1866 
1867   __ set_last_Java_frame(rsp, noreg, (address)the_pc);
1868 
1869 
1870   // We have all of the arguments set up at this point. We must not clobber any of the
1871   // argument registers from here on unless we save and restore them (see save_args/restore_args below).
1872 
1873   {
1874     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
1875     // protect the args we've loaded
1876     save_args(masm, total_c_args, c_arg, out_regs);
1877     __ mov_metadata(c_rarg1, method());
1878     __ call_VM_leaf(
1879       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
1880       r15_thread, c_rarg1);
1881     restore_args(masm, total_c_args, c_arg, out_regs);
1882   }
1883 
1884   // RedefineClasses() tracing support for obsolete method entry
1885   if (log_is_enabled(Trace, redefine, class, obsolete)) {
1886     // protect the args we've loaded
1887     save_args(masm, total_c_args, c_arg, out_regs);
1888     __ mov_metadata(c_rarg1, method());
1889     __ call_VM_leaf(
1890       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
1891       r15_thread, c_rarg1);
1892     restore_args(masm, total_c_args, c_arg, out_regs);
1893   }
1894 
1895   // Lock a synchronized method
1896 
1897   // Register definitions used by locking and unlocking
1898 
1899   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
1900   const Register obj_reg  = rbx;  // Will contain the oop
1901   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
1902   const Register old_hdr  = r13;  // value of old header at unlock time
1903 
1904   Label slow_path_lock;
1905   Label lock_done;
1906 
1907   if (method->is_synchronized()) {
1908 
1909     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
1910 
1911     // Get the handle (the 2nd argument)
1912     __ mov(oop_handle_reg, c_rarg1);
1913 
1914     // Get address of the box
1915 
1916     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
1917 
1918     // Load the oop from the handle
1919     __ movptr(obj_reg, Address(oop_handle_reg, 0));
1920 
1921     // Load immediate 1 into swap_reg %rax
1922     __ movl(swap_reg, 1);
1923 
1924     // Load (object->mark() | 1) into swap_reg %rax
1925     __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1926 
1927     // Save (object->mark() | 1) into BasicLock's displaced header
1928     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
1929 
1930     // src -> dest iff dest == rax else rax <- dest
1931     __ lock();
1932     __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1933     __ jcc(Assembler::equal, lock_done);
1934 
1935     // Hmm should this move to the slow path code area???
1936 
1937     // Test if the oopMark is an obvious stack pointer, i.e.,
1938     //  1) (mark & 3) == 0, and
1939     //  2) rsp <= mark < mark + os::pagesize()
1940     // These 3 tests can be done by evaluating the following
1941     // expression: ((mark - rsp) & (3 - os::vm_page_size())),
1942     // assuming both stack pointer and pagesize have their
1943     // least significant 2 bits clear.
1944     // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
1945 
1946     __ subptr(swap_reg, rsp);
1947     __ andptr(swap_reg, 3 - os::vm_page_size());
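    // For example, with a 4K page the mask is 3 - 4096 == 0x...f003, so the AND is zero
    // exactly when 0 <= (mark - rsp) < 4096 and the low two bits of the difference are
    // clear, i.e. the mark returned by the cmpxchg points into our own stack (a recursive lock).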
1948 
1949     // Save the test result, for recursive case, the result is zero
1950     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
1951     __ jcc(Assembler::notEqual, slow_path_lock);
1952 
1953     // Slow path will re-enter here
1954 
1955     __ bind(lock_done);
1956   }
1957 
1958   // Finally just about ready to make the JNI call
1959 
1960   // get JNIEnv* which is first argument to native
1961   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
1962 
1963   // Now set thread in native
1964   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
1965 
1966   __ call(RuntimeAddress(native_func));
1967 
1968   // Verify or restore cpu control state after JNI call
1969   __ restore_cpu_control_state_after_jni();
1970 
1971   // Unpack native results.
1972   switch (ret_type) {
1973   case T_BOOLEAN: __ c2bool(rax);            break;
1974   case T_CHAR   : __ movzwl(rax, rax);      break;
1975   case T_BYTE   : __ sign_extend_byte (rax); break;
1976   case T_SHORT  : __ sign_extend_short(rax); break;
1977   case T_INT    : /* nothing to do */        break;
1978   case T_DOUBLE :
1979   case T_FLOAT  :
1980     // Result is in xmm0 we'll save as needed
1981     break;
1982   case T_ARRAY:                 // Really a handle
1983   case T_OBJECT:                // Really a handle
1984       break; // can't de-handlize until after safepoint check
1985   case T_VOID: break;
1986   case T_LONG: break;
1987   default       : ShouldNotReachHere();
1988   }
1989 
1990   Label after_transition;
1991 
1992   // Switch thread to "native transition" state before reading the synchronization state.
1993   // This additional state is necessary because reading and testing the synchronization
1994   // state is not atomic w.r.t. GC, as this scenario demonstrates:
1995   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
1996   //     VM thread changes sync state to synchronizing and suspends threads for GC.
1997   //     Thread A is resumed to finish this native method, but doesn't block here since it
1998   //     didn't see any synchronization in progress, and escapes.
1999   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2000 
2001   // Force this write out before the read below
2002   __ membar(Assembler::Membar_mask_bits(
2003               Assembler::LoadLoad | Assembler::LoadStore |
2004               Assembler::StoreLoad | Assembler::StoreStore));
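  // The StoreLoad barrier is the important one: it makes the _thread_in_native_trans store
  // visible before the safepoint/suspend state is read below.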
2005 
2006   // check for safepoint operation in progress and/or pending suspend requests
2007   {
2008     Label Continue;
2009     Label slow_path;
2010 
2011     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2012 
2013     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2014     __ jcc(Assembler::equal, Continue);
2015     __ bind(slow_path);
2016 
2017     // Don't use call_VM as it will see a possible pending exception and forward it
2018     // and never return here preventing us from clearing _last_native_pc down below.
2019     // Nor can we use call_VM_leaf, as it will check to see if rsi & rdi are
2020     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2021     // by hand.
2022     //
2023     __ vzeroupper();
2024     save_native_result(masm, ret_type, stack_slots);
2025     __ mov(c_rarg0, r15_thread);
2026     __ mov(r12, rsp); // remember sp
2027     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2028     __ andptr(rsp, -16); // align stack as required by ABI
2029     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2030     __ mov(rsp, r12); // restore sp
2031     __ reinit_heapbase();
2032     // Restore any method result value
2033     restore_native_result(masm, ret_type, stack_slots);
2034     __ bind(Continue);
2035   }
2036 
2037   // change thread state
2038   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2039   __ bind(after_transition);
2040 
2041   Label reguard;
2042   Label reguard_done;
2043   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2044   __ jcc(Assembler::equal, reguard);
2045   __ bind(reguard_done);
2046 
2047   // native result if any is live
2048 
2049   // Unlock
2050   Label unlock_done;
2051   Label slow_path_unlock;
2052   if (method->is_synchronized()) {
2053 
2054     // Get locked oop from the handle we passed to jni
2055     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2056 
2057     Label done;
2058     // Simple recursive lock?
2059 
2060     __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2061     __ jcc(Assembler::equal, done);
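    // A NULL displaced header in our BasicLock means this was a recursive stack lock;
    // the outer lock still owns the object, so there is nothing to undo here.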
2062 
2063     // Must save rax if it is live now because cmpxchg must use it
2064     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2065       save_native_result(masm, ret_type, stack_slots);
2066     }
2067 
2068 
2069     // get address of the stack lock
2070     __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2071     //  get old displaced header
2072     __ movptr(old_hdr, Address(rax, 0));
2073 
2074     // Atomic swap old header if oop still contains the stack lock
2075     __ lock();
2076     __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2077     __ jcc(Assembler::notEqual, slow_path_unlock);
2078 
2079     // slow path re-enters here
2080     __ bind(unlock_done);
2081     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2082       restore_native_result(masm, ret_type, stack_slots);
2083     }
2084 
2085     __ bind(done);
2086 
2087   }
2088   {
2089     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2090     save_native_result(masm, ret_type, stack_slots);
2091     __ mov_metadata(c_rarg1, method());
2092     __ call_VM_leaf(
2093          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2094          r15_thread, c_rarg1);
2095     restore_native_result(masm, ret_type, stack_slots);
2096   }
2097 
2098   __ reset_last_Java_frame(false);
2099 
2100   // Unbox oop result, e.g. JNIHandles::resolve value.
2101   if (is_reference_type(ret_type)) {
2102     __ resolve_jobject(rax /* value */,
2103                        r15_thread /* thread */,
2104                        rcx /* tmp */);
2105   }
2106 
2107   if (CheckJNICalls) {
2108     // clear_pending_jni_exception_check
2109     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2110   }
2111 
2112   // reset handle block
2113   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2114   __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
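  // Resetting the top of the active JNIHandleBlock discards any local JNI handles the
  // native code created during this call.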
2115 
2116   // pop our frame
2117 
2118   __ leave();
2119 
2120   // Any exception pending?
2121   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2122   __ jcc(Assembler::notEqual, exception_pending);
2123 
2124   // Return
2125 
2126   __ ret(0);
2127 
2128   // Unexpected paths are out of line and go here
2129 
2130   // forward the exception
2131   __ bind(exception_pending);
2132 
2133   // and forward the exception
2134   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2135 
2136   // Slow path locking & unlocking
2137   if (method->is_synchronized()) {
2138 
2139     // BEGIN Slow path lock
2140     __ bind(slow_path_lock);
2141 
2142     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2143     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2144 
2145     // protect the args we've loaded
2146     save_args(masm, total_c_args, c_arg, out_regs);
2147 
2148     __ mov(c_rarg0, obj_reg);
2149     __ mov(c_rarg1, lock_reg);
2150     __ mov(c_rarg2, r15_thread);
2151 
2152     // Not a leaf but we have last_Java_frame setup as we want
2153     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2154     restore_args(masm, total_c_args, c_arg, out_regs);
2155 
2156 #ifdef ASSERT
2157     { Label L;
2158       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2159       __ jcc(Assembler::equal, L);
2160       __ stop("no pending exception allowed on exit from monitorenter");
2161       __ bind(L);
2162     }
2163 #endif
2164     __ jmp(lock_done);
2165 
2166     // END Slow path lock
2167 
2168     // BEGIN Slow path unlock
2169     __ bind(slow_path_unlock);
2170 
2171     // If we haven't already saved the native result we must save it now as xmm registers
2172     // are still exposed.
2173     __ vzeroupper();
2174     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2175       save_native_result(masm, ret_type, stack_slots);
2176     }
2177 
2178     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2179 
2180     __ mov(c_rarg0, obj_reg);
2181     __ mov(c_rarg2, r15_thread);
2182     __ mov(r12, rsp); // remember sp
2183     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2184     __ andptr(rsp, -16); // align stack as required by ABI
2185 
2186     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2187     // NOTE that obj_reg == rbx currently
2188     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2189     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2190 
2191     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2192     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2193     __ mov(rsp, r12); // restore sp
2194     __ reinit_heapbase();
2195 #ifdef ASSERT
2196     {
2197       Label L;
2198       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2199       __ jcc(Assembler::equal, L);
2200       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2201       __ bind(L);
2202     }
2203 #endif /* ASSERT */
2204 
2205     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2206 
2207     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2208       restore_native_result(masm, ret_type, stack_slots);
2209     }
2210     __ jmp(unlock_done);
2211 
2212     // END Slow path unlock
2213 
2214   } // synchronized
2215 
2216   // SLOW PATH Reguard the stack if needed
2217 
2218   __ bind(reguard);
2219   __ vzeroupper();
2220   save_native_result(masm, ret_type, stack_slots);
2221   __ mov(r12, rsp); // remember sp
2222   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2223   __ andptr(rsp, -16); // align stack as required by ABI
2224   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2225   __ mov(rsp, r12); // restore sp
2226   __ reinit_heapbase();
2227   restore_native_result(masm, ret_type, stack_slots);
2228   // and continue
2229   __ jmp(reguard_done);
2230 
2231 
2232 
2233   __ flush();
2234 
2235   nmethod *nm = nmethod::new_native_nmethod(method,
2236                                             compile_id,
2237                                             masm->code(),
2238                                             vep_offset,
2239                                             frame_complete,
2240                                             stack_slots / VMRegImpl::slots_per_word,
2241                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2242                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2243                                             oop_maps);
2244 
2245   return nm;
2246 }
2247 
2248 // this function returns the adjust size (in number of words) to a c2i adapter
2249 // activation for use during deoptimization
2250 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2251   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2252 }
2253 
2254 
2255 uint SharedRuntime::out_preserve_stack_slots() {
2256   return 0;
2257 }
2258 
2259 
2260 // Number of stack slots between incoming argument block and the start of
2261 // a new frame.  The PROLOG must add this many slots to the stack.  The
2262 // EPILOG must remove this many slots.  amd64 needs two slots for
2263 // return address.
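// (The 4 below presumably covers the 8-byte return address plus the saved rbp, two 32-bit
// slots each; VerifyStackAtCalls likely reserves room for the extra stack-check word.)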
2264 uint SharedRuntime::in_preserve_stack_slots() {
2265   return 4 + 2 * VerifyStackAtCalls;
2266 }
2267 
2268 //------------------------------generate_deopt_blob----------------------------
2269 void SharedRuntime::generate_deopt_blob() {
2270   // Allocate space for the code
2271   ResourceMark rm;
2272   // Setup code generation tools
2273   int pad = 0;
2274   if (UseAVX > 2) {
2275     pad += 1024;
2276   }
2277 #if INCLUDE_JVMCI
2278   if (EnableJVMCI) {
2279     pad += 512; // Increase the buffer size when compiling for JVMCI
2280   }
2281 #endif
2282   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2283   MacroAssembler* masm = new MacroAssembler(&buffer);
2284   int frame_size_in_words;
2285   OopMap* map = NULL;
2286   OopMapSet *oop_maps = new OopMapSet();
2287 
2288   // -------------
2289   // This code enters when returning to a de-optimized nmethod.  A return
2290 // address has been pushed on the stack, and return values are in
2291   // registers.
2292   // If we are doing a normal deopt then we were called from the patched
2293   // nmethod from the point we returned to the nmethod. So the return
2294   // address on the stack is wrong by NativeCall::instruction_size
2295   // We will adjust the value so it looks like we have the original return
2296   // address on the stack (like when we eagerly deoptimized).
2297   // In the case of an exception pending when deoptimizing, we enter
2298   // with a return address on the stack that points after the call we patched
2299   // into the exception handler. We have the following register state from,
2300   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2301   //    rax: exception oop
2302   //    rbx: exception handler
2303   //    rdx: throwing pc
2304   // So in this case we simply jam rdx into the useless return address and
2305   // the stack looks just like we want.
2306   //
2307   // At this point we need to de-opt.  We save the argument return
2308   // registers.  We call the first C routine, fetch_unroll_info().  This
2309   // routine captures the return values and returns a structure which
2310   // describes the current frame size and the sizes of all replacement frames.
2311   // The current frame is compiled code and may contain many inlined
2312   // functions, each with their own JVM state.  We pop the current frame, then
2313   // push all the new frames.  Then we call the C routine unpack_frames() to
2314   // populate these frames.  Finally unpack_frames() returns us the new target
2315   // address.  Notice that callee-save registers are BLOWN here; they have
2316   // already been captured in the vframeArray at the time the return PC was
2317   // patched.
2318   address start = __ pc();
2319   Label cont;
2320 
2321   // Prolog for non exception case!
2322 
2323   // Save everything in sight.
2324   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2325 
2326   // Normal deoptimization.  Save exec mode for unpack_frames.
2327   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2328   __ jmp(cont);
2329 
2330   int reexecute_offset = __ pc() - start;
2331 #if INCLUDE_JVMCI && !defined(COMPILER1)
2332   if (EnableJVMCI && UseJVMCICompiler) {
2333     // JVMCI does not use this kind of deoptimization
2334     __ should_not_reach_here();
2335   }
2336 #endif
2337 
2338   // Reexecute case
2339   // the return address is the pc that describes what bci to re-execute at
2340 
2341   // No need to update map as each call to save_live_registers will produce identical oopmap
2342   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2343 
2344   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2345   __ jmp(cont);
2346 
2347 #if INCLUDE_JVMCI
2348   Label after_fetch_unroll_info_call;
2349   int implicit_exception_uncommon_trap_offset = 0;
2350   int uncommon_trap_offset = 0;
2351 
2352   if (EnableJVMCI) {
2353     implicit_exception_uncommon_trap_offset = __ pc() - start;
2354 
2355     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2356     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2357 
2358     uncommon_trap_offset = __ pc() - start;
2359 
2360     // Save everything in sight.
2361     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2362     // fetch_unroll_info needs to call last_java_frame()
2363     __ set_last_Java_frame(noreg, noreg, NULL);
2364 
2365     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2366     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2367 
2368     __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2369     __ mov(c_rarg0, r15_thread);
2370     __ movl(c_rarg2, r14); // exec mode
2371     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2372     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2373 
2374     __ reset_last_Java_frame(false);
2375 
2376     __ jmp(after_fetch_unroll_info_call);
2377   } // EnableJVMCI
2378 #endif // INCLUDE_JVMCI
2379 
2380   int exception_offset = __ pc() - start;
2381 
2382   // Prolog for exception case
2383 
2384   // all registers are dead at this entry point, except for rax, and
2385   // rdx which contain the exception oop and exception pc
2386   // respectively.  Set them in TLS and fall thru to the
2387   // unpack_with_exception_in_tls entry point.
2388 
2389   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2390   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2391 
2392   int exception_in_tls_offset = __ pc() - start;
2393 
2394   // new implementation because exception oop is now passed in JavaThread
2395 
2396   // Prolog for exception case
2397   // All registers must be preserved because they might be used by LinearScan
2398   // Exception oop and throwing PC are passed in JavaThread
2399   // tos: stack at point of call to method that threw the exception (i.e. only
2400   // args are on the stack, no return address)
2401 
2402   // make room on stack for the return address
2403   // It will be patched later with the throwing pc. The correct value is not
2404   // available now because loading it from memory would destroy registers.
2405   __ push(0);
2406 
2407   // Save everything in sight.
2408   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2409 
2410   // Now it is safe to overwrite any register
2411 
2412   // Deopt during an exception.  Save exec mode for unpack_frames.
2413   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2414 
2415   // load throwing pc from JavaThread and patch it as the return address
2416   // of the current frame. Then clear the field in JavaThread
2417 
2418   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2419   __ movptr(Address(rbp, wordSize), rdx);
2420   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2421 
2422 #ifdef ASSERT
2423   // verify that there is really an exception oop in JavaThread
2424   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2425   __ verify_oop(rax);
2426 
2427   // verify that there is no pending exception
2428   Label no_pending_exception;
2429   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2430   __ testptr(rax, rax);
2431   __ jcc(Assembler::zero, no_pending_exception);
2432   __ stop("must not have pending exception here");
2433   __ bind(no_pending_exception);
2434 #endif
2435 
2436   __ bind(cont);
2437 
2438   // Call C code.  Need thread and this frame, but NOT official VM entry
2439   // crud.  We cannot block on this call, no GC can happen.
2440   //
2441   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2442 
2443   // fetch_unroll_info needs to call last_java_frame().
2444 
2445   __ set_last_Java_frame(noreg, noreg, NULL);
2446 #ifdef ASSERT
2447   { Label L;
2448     __ cmpptr(Address(r15_thread,
2449                     JavaThread::last_Java_fp_offset()),
2450             (int32_t)0);
2451     __ jcc(Assembler::equal, L);
2452     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2453     __ bind(L);
2454   }
2455 #endif // ASSERT
2456   __ mov(c_rarg0, r15_thread);
2457   __ movl(c_rarg1, r14); // exec_mode
2458   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2459 
2460   // Need to have an oopmap that tells fetch_unroll_info where to
2461   // find any register it might need.
2462   oop_maps->add_gc_map(__ pc() - start, map);
2463 
2464   __ reset_last_Java_frame(false);
2465 
2466 #if INCLUDE_JVMCI
2467   if (EnableJVMCI) {
2468     __ bind(after_fetch_unroll_info_call);
2469   }
2470 #endif
2471 
2472   // Load UnrollBlock* into rdi
2473   __ mov(rdi, rax);
2474 
2475   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2476   Label noException;
2477   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2478   __ jcc(Assembler::notEqual, noException);
2479   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2480   // QQQ this is useless it was NULL above
2481   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2482   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2483   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2484 
2485   __ verify_oop(rax);
2486 
2487   // Overwrite the result registers with the exception results.
2488   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2489   // I think this is useless
2490   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2491 
2492   __ bind(noException);
2493 
2494   // Only register save data is on the stack.
2495   // Now restore the result registers.  Everything else is either dead
2496   // or captured in the vframeArray.
2497   RegisterSaver::restore_result_registers(masm);
2498 
2499   // All of the register save area has been popped off the stack. Only the
2500   // return address remains.
2501 
2502   // Pop all the frames we must move/replace.
2503   //
2504   // Frame picture (youngest to oldest)
2505   // 1: self-frame (no frame link)
2506   // 2: deopting frame  (no frame link)
2507   // 3: caller of deopting frame (could be compiled/interpreted).
2508   //
2509   // Note: by leaving the return address of self-frame on the stack
2510   // and using the size of frame 2 to adjust the stack
2511   // when we are done the return to frame 3 will still be on the stack.
2512 
2513   // Pop deoptimized frame
2514   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2515   __ addptr(rsp, rcx);
2516 
2517   // rsp should be pointing at the return address to the caller (3)
2518 
2519   // Pick up the initial fp we should save
2520   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2521   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2522 
2523 #ifdef ASSERT
2524   // Compilers generate code that bang the stack by as much as the
2525   // interpreter would need. So this stack banging should never
2526   // trigger a fault. Verify that it does not on non product builds.
2527   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2528   __ bang_stack_size(rbx, rcx);
2529 #endif
2530 
2531   // Load address of array of frame pcs into rcx
2532   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2533 
2534   // Trash the old pc
2535   __ addptr(rsp, wordSize);
2536 
2537   // Load address of array of frame sizes into rsi
2538   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2539 
2540   // Load counter into rdx
2541   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2542 
2543   // Now adjust the caller's stack to make up for the extra locals
2544   // but record the original sp so that we can save it in the skeletal interpreter
2545   // frame and the stack walking of interpreter_sender will get the unextended sp
2546   // value and not the "real" sp value.
2547 
2548   const Register sender_sp = r8;
2549 
2550   __ mov(sender_sp, rsp);
2551   __ movl(rbx, Address(rdi,
2552                        Deoptimization::UnrollBlock::
2553                        caller_adjustment_offset_in_bytes()));
2554   __ subptr(rsp, rbx);
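  // rsp is now extended by caller_adjustment, which (roughly) makes room for the extra
  // interpreter locals of the callee beyond its incoming parameters (cf. last_frame_adjust above).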
2555 
2556   // Push interpreter frames in a loop
2557   Label loop;
2558   __ bind(loop);
2559   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2560   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2561   __ pushptr(Address(rcx, 0));          // Save return address
2562   __ enter();                           // Save old & set new ebp
2563   __ subptr(rsp, rbx);                  // Prolog
2564   // This value is corrected by layout_activation_impl
2565   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2566   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2567   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2568   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2569   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2570   __ decrementl(rdx);                   // Decrement counter
2571   __ jcc(Assembler::notZero, loop);
2572   __ pushptr(Address(rcx, 0));          // Save final return address
2573 
2574   // Re-push self-frame
2575   __ enter();                           // Save old & set new ebp
2576 
2577   // Allocate a full sized register save area.
2578   // Return address and rbp are in place, so we allocate two less words.
2579   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2580 
2581   // Restore frame locals after moving the frame
2582   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2583   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2584 
2585   // Call C code.  Need thread but NOT official VM entry
2586   // crud.  We cannot block on this call, no GC can happen.  Call should
2587   // restore return values to their stack-slots with the new SP.
2588   //
2589   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2590 
2591   // Use rbp because the frames look interpreted now
2592   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2593   // Don't need the precise return PC here, just precise enough to point into this code blob.
2594   address the_pc = __ pc();
2595   __ set_last_Java_frame(noreg, rbp, the_pc);
2596 
2597   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2598   __ mov(c_rarg0, r15_thread);
2599   __ movl(c_rarg1, r14); // second arg: exec_mode
2600   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2601   // Revert SP alignment after call since we're going to do some SP relative addressing below
2602   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2603 
2604   // Set an oopmap for the call site
2605   // Use the same PC we used for the last java frame
2606   oop_maps->add_gc_map(the_pc - start,
2607                        new OopMap( frame_size_in_words, 0 ));
2608 
2609   // Clear fp AND pc
2610   __ reset_last_Java_frame(true);
2611 
2612   // Collect return values
2613   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2614   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2615   // I think this is useless (throwing pc?)
2616   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2617 
2618   // Pop self-frame.
2619   __ leave();                           // Epilog
2620 
2621   // Jump to interpreter
2622   __ ret(0);
2623 
2624   // Make sure all code is generated
2625   masm->flush();
2626 
2627   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2628   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2629 #if INCLUDE_JVMCI
2630   if (EnableJVMCI) {
2631     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2632     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2633   }
2634 #endif
2635 }
2636 
2637 #ifdef COMPILER2
2638 //------------------------------generate_uncommon_trap_blob--------------------
2639 void SharedRuntime::generate_uncommon_trap_blob() {
2640   // Allocate space for the code
2641   ResourceMark rm;
2642   // Setup code generation tools
2643   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2644   MacroAssembler* masm = new MacroAssembler(&buffer);
2645 
2646   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2647 
2648   address start = __ pc();
2649 
2650   if (UseRTMLocking) {
2651     // Abort RTM transaction before possible nmethod deoptimization.
2652     __ xabort(0);
2653   }
2654 
2655   // Push self-frame.  We get here with a return address on the
2656   // stack, so rsp is 8-byte aligned until we allocate our frame.
2657   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog!
2658 
2659   // No callee saved registers. rbp is assumed implicitly saved
2660   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
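       // For example, on Linux (frame::arg_reg_save_area_bytes == 0) rbp_off == 0 and
       // return_off == 2, so the subptr above reserves 8 bytes below the return address
       // and rbp is stored at the bottom of the frame; together with the return address
       // this keeps rsp 16-byte aligned.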
2661 
2662   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2663   // runtime expects it.
2664   __ movl(c_rarg1, j_rarg0);
2665 
2666   __ set_last_Java_frame(noreg, noreg, NULL);
2667 
2668   // Call C code.  Need thread but NOT official VM entry
2669   // crud.  We cannot block on this call, no GC can happen.  Call should
2670   // capture callee-saved registers as well as return values.
2671   // Thread is in rdi already.
2672   //
2673   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
2674 
2675   __ mov(c_rarg0, r15_thread);
2676   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2677   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2678 
2679   // Set an oopmap for the call site
2680   OopMapSet* oop_maps = new OopMapSet();
2681   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2682 
2683   // location of rbp is known implicitly by the frame sender code
2684 
2685   oop_maps->add_gc_map(__ pc() - start, map);
2686 
2687   __ reset_last_Java_frame(false);
2688 
2689   // Load UnrollBlock* into rdi
2690   __ mov(rdi, rax);
2691 
2692 #ifdef ASSERT
2693   { Label L;
2694     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2695             (int32_t)Deoptimization::Unpack_uncommon_trap);
2696     __ jcc(Assembler::equal, L);
2697     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2698     __ bind(L);
2699   }
2700 #endif
2701 
2702   // Pop all the frames we must move/replace.
2703   //
2704   // Frame picture (youngest to oldest)
2705   // 1: self-frame (no frame link)
2706   // 2: deopting frame  (no frame link)
2707   // 3: caller of deopting frame (could be compiled/interpreted).
2708 
2709   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2710   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2711 
2712   // Pop deoptimized frame (int)
2713   __ movl(rcx, Address(rdi,
2714                        Deoptimization::UnrollBlock::
2715                        size_of_deoptimized_frame_offset_in_bytes()));
2716   __ addptr(rsp, rcx);
2717 
2718   // rsp should be pointing at the return address to the caller (3)
2719 
2720   // Pick up the initial fp we should save
2721   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2722   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2723 
2724 #ifdef ASSERT
2725   // Compilers generate code that bangs the stack by as much as the
2726   // interpreter would need, so this stack banging should never
2727   // trigger a fault. Verify that it does not on non-product builds.
2728   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2729   __ bang_stack_size(rbx, rcx);
2730 #endif
2731 
2732   // Load address of array of frame pcs into rcx (address*)
2733   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2734 
2735   // Trash the return pc
2736   __ addptr(rsp, wordSize);
2737 
2738   // Load address of array of frame sizes into rsi (intptr_t*)
2739   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2740 
2741   // Counter
2742   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
2743 
2744   // Now adjust the caller's stack to make up for the extra locals, but
2745   // record the original sp first so that we can save it in the skeletal
2746   // interpreter frame; stack walking of interpreter_sender will then
2747   // see the unextended sp value and not the "real" sp value.
2748 
2749   const Register sender_sp = r8;
2750 
2751   __ mov(sender_sp, rsp);
2752   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
2753   __ subptr(rsp, rbx);
2754 
2755   // Push interpreter frames in a loop
2756   Label loop;
2757   __ bind(loop);
2758   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2759   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2760   __ pushptr(Address(rcx, 0));     // Save return address
2761   __ enter();                      // Save old & set new rbp
2762   __ subptr(rsp, rbx);             // Prolog
2763   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2764             sender_sp);            // Make it walkable
2765   // This value is corrected by layout_activation_impl
2766   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
2767   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
2768   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
2769   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
2770   __ decrementl(rdx);              // Decrement counter
2771   __ jcc(Assembler::notZero, loop);
2772   __ pushptr(Address(rcx, 0));     // Save final return address
2773 
2774   // Re-push self-frame
2775   __ enter();                 // Save old & set new rbp
2776   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2777                               // Prolog
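       // framesize - 4: the final return address (2 jint slots) pushed above and the old
       // rbp (2 jint slots) pushed by enter() are already on the stack, so only the rest
       // of the SimpleRuntimeFrame needs to be allocated here.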
2778 
2779   // Use rbp because the frames look interpreted now
2780   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2781   // Don't need the precise return PC here, just precise enough to point into this code blob.
2782   address the_pc = __ pc();
2783   __ set_last_Java_frame(noreg, rbp, the_pc);
2784 
2785   // Call C code.  Need thread but NOT official VM entry
2786   // crud.  We cannot block on this call, no GC can happen.  Call should
2787   // restore return values to their stack-slots with the new SP.
2788   // Thread is in rdi already.
2789   //
2790   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2791 
2792   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2793   __ mov(c_rarg0, r15_thread);
2794   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2795   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2796 
2797   // Set an oopmap for the call site
2798   // Use the same PC we used for the last java frame
2799   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2800 
2801   // Clear fp AND pc
2802   __ reset_last_Java_frame(true);
2803 
2804   // Pop self-frame.
2805   __ leave();                 // Epilog
2806 
2807   // Jump to interpreter
2808   __ ret(0);
2809 
2810   // Make sure all code is generated
2811   masm->flush();
2812 
2813   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
2814                                                  SimpleRuntimeFrame::framesize >> 1);
2815 }
2816 #endif // COMPILER2
2817 
2818 //------------------------------generate_handler_blob------
2819 //
2820 // Generate a special Compile2Runtime blob that saves all registers,
2821 // and sets up an oopmap.
2822 //
2823 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
2824   assert(StubRoutines::forward_exception_entry() != NULL,
2825          "must be generated before");
2826 
2827   ResourceMark rm;
2828   OopMapSet *oop_maps = new OopMapSet();
2829   OopMap* map;
2830 
2831   // Allocate space for the code.  Setup code generation tools.
2832   CodeBuffer buffer("handler_blob", 2048, 1024);
2833   MacroAssembler* masm = new MacroAssembler(&buffer);
2834 
2835   address start   = __ pc();
2836   address call_pc = NULL;
2837   int frame_size_in_words;
2838   bool cause_return = (poll_type == POLL_AT_RETURN);
2839   bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
2840 
2841   if (UseRTMLocking) {
2842     // Abort RTM transaction before calling runtime
2843     // because critical section will be large and will be
2844     // aborted anyway. Also nmethod could be deoptimized.
2845     __ xabort(0);
2846   }
2847 
2848   // Make room for return address (or push it again)
2849   if (!cause_return) {
2850     __ push(rbx);
2851   }
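       // When !cause_return the poll happened at an arbitrary pc, so the slot pushed above
       // is only a placeholder for the return address; the real pc (saved_exception_pc) is
       // stored into it further down, once the register-save frame has been set up.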
2852 
2853   // Save registers, fpu state, and flags
2854   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
2855 
2856   // The following is basically a call_VM.  However, we need the precise
2857   // address of the call in order to generate an oopmap. Hence, we do all the
2858   // work ourselves.
2859 
2860   __ set_last_Java_frame(noreg, noreg, NULL);
2861 
2862   // The return address must always be correct so that the frame constructor never
2863   // sees an invalid pc.
2864 
2865   if (!cause_return) {
2866     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
2867     // Additionally, rbx is a callee saved register and we can look at it later to determine
2868     // if someone changed the return address for us!
2869     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
2870     __ movptr(Address(rbp, wordSize), rbx);
2871   }
2872 
2873   // Do the call
2874   __ mov(c_rarg0, r15_thread);
2875   __ call(RuntimeAddress(call_ptr));
2876 
2877   // Set an oopmap for the call site.  This oopmap will map all
2878   // oop-registers and debug-info registers as callee-saved.  This
2879   // will allow deoptimization at this safepoint to find all possible
2880   // debug-info recordings, as well as let GC find all oops.
2881 
2882   oop_maps->add_gc_map( __ pc() - start, map);
2883 
2884   Label noException;
2885 
2886   __ reset_last_Java_frame(false);
2887 
2888   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
2889   __ jcc(Assembler::equal, noException);
2890 
2891   // Exception pending
2892 
2893   RegisterSaver::restore_live_registers(masm, save_vectors);
2894 
2895   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2896 
2897   // No exception case
2898   __ bind(noException);
2899 
2900   Label no_adjust;
2901 #ifdef ASSERT
2902   Label bail;
2903 #endif
2904   if (!cause_return) {
2905     Label no_prefix, not_special;
2906 
2907     // If our stashed return pc was modified by the runtime we avoid touching it
2908     __ cmpptr(rbx, Address(rbp, wordSize));
2909     __ jccb(Assembler::notEqual, no_adjust);
2910 
2911     // Skip over the poll instruction.
2912     // See NativeInstruction::is_safepoint_poll()
2913     // Possible encodings:
2914     //      85 00       test   %eax,(%rax)
2915     //      85 01       test   %eax,(%rcx)
2916     //      85 02       test   %eax,(%rdx)
2917     //      85 03       test   %eax,(%rbx)
2918     //      85 06       test   %eax,(%rsi)
2919     //      85 07       test   %eax,(%rdi)
2920     //
2921     //   41 85 00       test   %eax,(%r8)
2922     //   41 85 01       test   %eax,(%r9)
2923     //   41 85 02       test   %eax,(%r10)
2924     //   41 85 03       test   %eax,(%r11)
2925     //   41 85 06       test   %eax,(%r14)
2926     //   41 85 07       test   %eax,(%r15)
2927     //
2928     //      85 04 24    test   %eax,(%rsp)
2929     //   41 85 04 24    test   %eax,(%r12)
2930     //      85 45 00    test   %eax,0x0(%rbp)
2931     //   41 85 45 00    test   %eax,0x0(%r13)
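         // In total the return pc is advanced by 2 bytes (opcode + modrm), plus 1 byte if a
         // REX.B prefix is present, plus 1 more byte when the base register is rsp/rbp/r12/r13
         // (which need a SIB byte or a zero displacement), matching the adjustments below.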
2932 
2933     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
2934     __ jcc(Assembler::notEqual, no_prefix);
2935     __ addptr(rbx, 1);
2936     __ bind(no_prefix);
2937 #ifdef ASSERT
2938     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
2939 #endif
2940     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
2941     // r12/rsp 0x04
2942     // r13/rbp 0x05
2943     __ movzbq(rcx, Address(rbx, 1));
2944     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
2945     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
2946     __ cmpptr(rcx, 1);
2947     __ jcc(Assembler::above, not_special);
2948     __ addptr(rbx, 1);
2949     __ bind(not_special);
2950 #ifdef ASSERT
2951     // Verify the correct encoding of the poll we're about to skip.
2952     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
2953     __ jcc(Assembler::notEqual, bail);
2954     // Mask out the modrm bits
2955     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
2956     // rax encodes to 0, so if the bits are nonzero it's incorrect
2957     __ jcc(Assembler::notZero, bail);
2958 #endif
2959     // Adjust return pc forward to step over the safepoint poll instruction
2960     __ addptr(rbx, 2);
2961     __ movptr(Address(rbp, wordSize), rbx);
2962   }
2963 
2964   __ bind(no_adjust);
2965   // Normal exit, restore registers and exit.
2966   RegisterSaver::restore_live_registers(masm, save_vectors);
2967   __ ret(0);
2968 
2969 #ifdef ASSERT
2970   __ bind(bail);
2971   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
2972 #endif
2973 
2974   // Make sure all code is generated
2975   masm->flush();
2976 
2977   // Fill-out other meta info
2978   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
2979 }
2980 
2981 //
2982 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
2983 //
2984 // Generate a stub that calls into the VM to find out the proper destination
2985 // of a Java call. All the argument registers are live at this point,
2986 // but since this is generic code we don't know what they are, and the caller
2987 // must do any GC of the args.
2988 //
2989 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
2990   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
2991 
2992   // allocate space for the code
2993   ResourceMark rm;
2994 
2995   CodeBuffer buffer(name, 1000, 512);
2996   MacroAssembler* masm                = new MacroAssembler(&buffer);
2997 
2998   int frame_size_in_words;
2999 
3000   OopMapSet *oop_maps = new OopMapSet();
3001   OopMap* map = NULL;
3002 
3003   int start = __ offset();
3004 
3005   // No need to save vector registers since they are caller-saved anyway.
3006   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
3007 
3008   int frame_complete = __ offset();
3009 
3010   __ set_last_Java_frame(noreg, noreg, NULL);
3011 
3012   __ mov(c_rarg0, r15_thread);
3013 
3014   __ call(RuntimeAddress(destination));
3015 
3016 
3017   // Set an oopmap for the call site.
3018   // We need this not only for callee-saved registers, but also for volatile
3019   // registers that the compiler might be keeping live across a safepoint.
3020 
3021   oop_maps->add_gc_map( __ offset() - start, map);
3022 
3023   // rax contains the address we are going to jump to, assuming no exception was installed
3024 
3025   // clear last_Java_sp
3026   __ reset_last_Java_frame(false);
3027   // check for pending exceptions
3028   Label pending;
3029   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3030   __ jcc(Assembler::notEqual, pending);
3031 
3032   // get the returned Method*
3033   __ get_vm_result_2(rbx, r15_thread);
3034   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3035 
3036   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3037 
3038   RegisterSaver::restore_live_registers(masm);
3039 
3040   // We are back to the original state on entry and ready to go.
3041 
3042   __ jmp(rax);
3043 
3044   // Pending exception after the safepoint
3045 
3046   __ bind(pending);
3047 
3048   RegisterSaver::restore_live_registers(masm);
3049 
3050   // exception pending => remove activation and forward to exception handler
3051 
3052   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3053 
3054   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3055   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3056 
3057   // -------------
3058   // make sure all code is generated
3059   masm->flush();
3060 
3061   // Return the blob.
3062   // frame_size_in_words is in words, as new_runtime_stub expects.
3063   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3064 }
3065 
3066 #ifdef COMPILER2
3067 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3068 
3069 class NativeInvokerGenerator : public StubCodeGenerator {
3070   address _call_target;
3071   int _shadow_space_bytes;
3072 
3073   const GrowableArray<VMReg>& _input_registers;
3074   const GrowableArray<VMReg>& _output_registers;
3075 
3076   int _frame_complete;
3077   int _framesize;
3078   OopMapSet* _oop_maps;
3079 public:
3080   NativeInvokerGenerator(CodeBuffer* buffer,
3081                          address call_target,
3082                          int shadow_space_bytes,
3083                          const GrowableArray<VMReg>& input_registers,
3084                          const GrowableArray<VMReg>& output_registers)
3085    : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3086      _call_target(call_target),
3087      _shadow_space_bytes(shadow_space_bytes),
3088      _input_registers(input_registers),
3089      _output_registers(output_registers),
3090      _frame_complete(0),
3091      _framesize(0),
3092      _oop_maps(NULL) {
3093     assert(_output_registers.length() <= 1
3094            || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3095 
3096   }
3097 
3098   void generate();
3099 
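       // Size of the stack slot needed to preserve the single return value across the
       // slow-path runtime calls: 8 bytes for a general-purpose register, or the full
       // XMM/YMM/ZMM width depending on the AVX level in use.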
3100   int spill_size_in_bytes() const {
3101     if (_output_registers.length() == 0) {
3102       return 0;
3103     }
3104     VMReg reg = _output_registers.at(0);
3105     assert(reg->is_reg(), "must be a register");
3106     if (reg->is_Register()) {
3107       return 8;
3108     } else if (reg->is_XMMRegister()) {
3109       if (UseAVX >= 3) {
3110         return 64;
3111       } else if (UseAVX >= 1) {
3112         return 32;
3113       } else {
3114         return 16;
3115       }
3116     } else {
3117       ShouldNotReachHere();
3118     }
3119     return 0;
3120   }
3121 
3122   void spill_out_registers() {
3123     if (_output_registers.length() == 0) {
3124       return;
3125     }
3126     VMReg reg = _output_registers.at(0);
3127     assert(reg->is_reg(), "must be a register");
3128     MacroAssembler* masm = _masm;
3129     if (reg->is_Register()) {
3130       __ movptr(Address(rsp, 0), reg->as_Register());
3131     } else if (reg->is_XMMRegister()) {
3132       if (UseAVX >= 3) {
3133         __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3134       } else if (UseAVX >= 1) {
3135         __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3136       } else {
3137         __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3138       }
3139     } else {
3140       ShouldNotReachHere();
3141     }
3142   }
3143 
3144   void fill_out_registers() {
3145     if (_output_registers.length() == 0) {
3146       return;
3147     }
3148     VMReg reg = _output_registers.at(0);
3149     assert(reg->is_reg(), "must be a register");
3150     MacroAssembler* masm = _masm;
3151     if (reg->is_Register()) {
3152       __ movptr(reg->as_Register(), Address(rsp, 0));
3153     } else if (reg->is_XMMRegister()) {
3154       if (UseAVX >= 3) {
3155         __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3156       } else if (UseAVX >= 1) {
3157         __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3158       } else {
3159         __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3160       }
3161     } else {
3162       ShouldNotReachHere();
3163     }
3164   }
3165 
3166   int frame_complete() const {
3167     return _frame_complete;
3168   }
3169 
3170   int framesize() const {
3171     return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3172   }
3173 
3174   OopMapSet* oop_maps() const {
3175     return _oop_maps;
3176   }
3177 
3178 private:
3179 #ifdef ASSERT
3180 bool target_uses_register(VMReg reg) {
3181   return _input_registers.contains(reg) || _output_registers.contains(reg);
3182 }
3183 #endif
3184 };
3185 
3186 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3187                                                 int shadow_space_bytes,
3188                                                 const GrowableArray<VMReg>& input_registers,
3189                                                 const GrowableArray<VMReg>& output_registers) {
3190   int locs_size  = 64;
3191   CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3192   NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3193   g.generate();
3194   code.log_section_sizes("nep_invoker_blob");
3195 
3196   RuntimeStub* stub =
3197     RuntimeStub::new_runtime_stub("nep_invoker_blob",
3198                                   &code,
3199                                   g.frame_complete(),
3200                                   g.framesize(),
3201                                   g.oop_maps(), false);
3202   return stub;
3203 }
3204 
3205 void NativeInvokerGenerator::generate() {
3206   assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3207 
3208   enum layout {
3209     rbp_off,
3210     rbp_off2,
3211     return_off,
3212     return_off2,
3213     framesize // inclusive of return address
3214   };
3215 
3216   _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3217   assert(is_even(_framesize/2), "sp not 16-byte aligned");
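       // Worked example (assuming no shadow space and a single GPR return value, so
       // spill_size_in_bytes() == 8): align_up(4 + 8/4, 4) == 8 slots == 32 bytes, which
       // keeps rsp 16-byte aligned after the enter()/subptr sequence below.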
3218 
3219   _oop_maps  = new OopMapSet();
3220   MacroAssembler* masm = _masm;
3221 
3222   address start = __ pc();
3223 
3224   __ enter();
3225 
3226   // return address and rbp are already in place
3227   __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3228 
3229   _frame_complete = __ pc() - start;
3230 
3231   address the_pc = __ pc();
3232 
3233   __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3234   OopMap* map = new OopMap(_framesize, 0);
3235   _oop_maps->add_gc_map(the_pc - start, map);
3236 
3237   // State transition
3238   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3239 
3240   __ call(RuntimeAddress(_call_target));
3241 
3242   __ restore_cpu_control_state_after_jni();
3243 
3244   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3245 
3246   // Force this write out before the read below
3247   __ membar(Assembler::Membar_mask_bits(
3248           Assembler::LoadLoad | Assembler::LoadStore |
3249           Assembler::StoreLoad | Assembler::StoreStore));
3250 
3251   Label L_after_safepoint_poll;
3252   Label L_safepoint_poll_slow_path;
3253 
3254   __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3255   __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3256   __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3257 
3258   __ bind(L_after_safepoint_poll);
3259 
3260   // change thread state
3261   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3262 
3263   __ block_comment("reguard stack check");
3264   Label L_reguard;
3265   Label L_after_reguard;
3266   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3267   __ jcc(Assembler::equal, L_reguard);
3268   __ bind(L_after_reguard);
3269 
3270   __ reset_last_Java_frame(r15_thread, true);
3271 
3272   __ leave(); // required for proper stackwalking of RuntimeStub frame
3273   __ ret(0);
3274 
3275   //////////////////////////////////////////////////////////////////////////////
3276 
3277   __ block_comment("{ L_safepoint_poll_slow_path");
3278   __ bind(L_safepoint_poll_slow_path);
3279   __ vzeroupper();
3280 
3281   spill_out_registers();
3282 
3283   __ mov(c_rarg0, r15_thread);
3284   __ mov(r12, rsp); // remember sp
3285   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3286   __ andptr(rsp, -16); // align stack as required by ABI
3287   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3288   __ mov(rsp, r12); // restore sp
3289   __ reinit_heapbase();
3290 
3291   fill_out_registers();
3292 
3293   __ jmp(L_after_safepoint_poll);
3294   __ block_comment("} L_safepoint_poll_slow_path");
3295 
3296   //////////////////////////////////////////////////////////////////////////////
3297 
3298   __ block_comment("{ L_reguard");
3299   __ bind(L_reguard);
3300   __ vzeroupper();
3301 
3302   spill_out_registers();
3303 
3304   __ mov(r12, rsp); // remember sp
3305   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3306   __ andptr(rsp, -16); // align stack as required by ABI
3307   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3308   __ mov(rsp, r12); // restore sp
3309   __ reinit_heapbase();
3310 
3311   fill_out_registers();
3312 
3313   __ jmp(L_after_reguard);
3314 
3315   __ block_comment("} L_reguard");
3316 
3317   //////////////////////////////////////////////////////////////////////////////
3318 
3319   __ flush();
3320 }
3321 #endif // COMPILER2
3322 
3323 //------------------------------Montgomery multiplication------------------------
3324 //
3325 
3326 #ifndef _WINDOWS
3327 
3328 // Subtract 0:b from carry:a.  Return carry.
3329 static julong
3330 sub(julong a[], julong b[], julong carry, long len) {
3331   long long i = 0, cnt = len;
3332   julong tmp;
3333   asm volatile("clc; "
3334                "0: ; "
3335                "mov (%[b], %[i], 8), %[tmp]; "
3336                "sbb %[tmp], (%[a], %[i], 8); "
3337                "inc %[i]; dec %[cnt]; "
3338                "jne 0b; "
3339                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3340                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3341                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3342                : "memory");
3343   return tmp;
3344 }
3345 
3346 // Multiply (unsigned) Long A by Long B, accumulating the double-
3347 // length result into the accumulator formed of T0, T1, and T2.
3348 #define MACC(A, B, T0, T1, T2)                                  \
3349 do {                                                            \
3350   unsigned long hi, lo;                                         \
3351   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3352            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3353            : "r"(A), "a"(B) : "cc");                            \
3354  } while(0)
3355 
3356 // As above, but add twice the double-length result into the
3357 // accumulator.
3358 #define MACC2(A, B, T0, T1, T2)                                 \
3359 do {                                                            \
3360   unsigned long hi, lo;                                         \
3361   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3362            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3363            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3364            : "r"(A), "a"(B) : "cc");                            \
3365  } while(0)
3366 
3367 #else //_WINDOWS
3368 
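     // Subtract 0:b from carry:a.  Return carry.  (Same contract as the GCC inline-asm
     // version above.)  a - b is computed as a + ~b + 1: the carry chain is seeded with 1
     // and ~b[i] is added via _addcarry_u64; the final step folds the borrow into the
     // incoming carry word and returns it.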
3369 static julong
3370 sub(julong a[], julong b[], julong carry, long len) {
3371   long i;
3372   julong tmp;
3373   unsigned char c = 1;
3374   for (i = 0; i < len; i++) {
3375     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3376     a[i] = tmp;
3377   }
3378   c = _addcarry_u64(c, carry, ~0, &tmp);
3379   return tmp;
3380 }
3381 
3382 // Multiply (unsigned) Long A by Long B, accumulating the double-
3383 // length result into the accumulator formed of T0, T1, and T2.
3384 #define MACC(A, B, T0, T1, T2)                          \
3385 do {                                                    \
3386   julong hi, lo;                            \
3387   lo = _umul128(A, B, &hi);                             \
3388   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3389   c = _addcarry_u64(c, hi, T1, &T1);                    \
3390   _addcarry_u64(c, T2, 0, &T2);                         \
3391  } while(0)
3392 
3393 // As above, but add twice the double-length result into the
3394 // accumulator.
3395 #define MACC2(A, B, T0, T1, T2)                         \
3396 do {                                                    \
3397   julong hi, lo;                            \
3398   lo = _umul128(A, B, &hi);                             \
3399   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3400   c = _addcarry_u64(c, hi, T1, &T1);                    \
3401   _addcarry_u64(c, T2, 0, &T2);                         \
3402   c = _addcarry_u64(0, lo, T0, &T0);                    \
3403   c = _addcarry_u64(c, hi, T1, &T1);                    \
3404   _addcarry_u64(c, T2, 0, &T2);                         \
3405  } while(0)
3406 
3407 #endif //_WINDOWS
3408 
3409 // Fast Montgomery multiplication.  The derivation of the algorithm is
3410 // in  A Cryptographic Library for the Motorola DSP56000,
3411 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
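     // The key invariant: inv == -n[0]^-1 (mod 2^64) (hence the asserts that
     // inv * n[0] == ULLONG_MAX), so choosing m[i] = t0 * inv makes
     // t0 + m[i] * n[0] == 0 (mod 2^64).  The low word of the accumulator therefore
     // cancels at every step, which is what the assert(t0 == 0, ...) below checks.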
3412 
3413 static void NOINLINE
3414 montgomery_multiply(julong a[], julong b[], julong n[],
3415                     julong m[], julong inv, int len) {
3416   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3417   int i;
3418 
3419   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3420 
3421   for (i = 0; i < len; i++) {
3422     int j;
3423     for (j = 0; j < i; j++) {
3424       MACC(a[j], b[i-j], t0, t1, t2);
3425       MACC(m[j], n[i-j], t0, t1, t2);
3426     }
3427     MACC(a[i], b[0], t0, t1, t2);
3428     m[i] = t0 * inv;
3429     MACC(m[i], n[0], t0, t1, t2);
3430 
3431     assert(t0 == 0, "broken Montgomery multiply");
3432 
3433     t0 = t1; t1 = t2; t2 = 0;
3434   }
3435 
3436   for (i = len; i < 2*len; i++) {
3437     int j;
3438     for (j = i-len+1; j < len; j++) {
3439       MACC(a[j], b[i-j], t0, t1, t2);
3440       MACC(m[j], n[i-j], t0, t1, t2);
3441     }
3442     m[i-len] = t0;
3443     t0 = t1; t1 = t2; t2 = 0;
3444   }
3445 
3446   while (t0)
3447     t0 = sub(m, n, t0, len);
3448 }
3449 
3450 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3451 // multiplies so it should be up to 25% faster than Montgomery
3452 // multiplication.  However, its loop control is more complex and it
3453 // may actually run slower on some machines.
3454 
3455 static void NOINLINE
3456 montgomery_square(julong a[], julong n[],
3457                   julong m[], julong inv, int len) {
3458   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3459   int i;
3460 
3461   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3462 
3463   for (i = 0; i < len; i++) {
3464     int j;
3465     int end = (i+1)/2;
3466     for (j = 0; j < end; j++) {
3467       MACC2(a[j], a[i-j], t0, t1, t2);
3468       MACC(m[j], n[i-j], t0, t1, t2);
3469     }
3470     if ((i & 1) == 0) {
3471       MACC(a[j], a[j], t0, t1, t2);
3472     }
3473     for (; j < i; j++) {
3474       MACC(m[j], n[i-j], t0, t1, t2);
3475     }
3476     m[i] = t0 * inv;
3477     MACC(m[i], n[0], t0, t1, t2);
3478 
3479     assert(t0 == 0, "broken Montgomery square");
3480 
3481     t0 = t1; t1 = t2; t2 = 0;
3482   }
3483 
3484   for (i = len; i < 2*len; i++) {
3485     int start = i-len+1;
3486     int end = start + (len - start)/2;
3487     int j;
3488     for (j = start; j < end; j++) {
3489       MACC2(a[j], a[i-j], t0, t1, t2);
3490       MACC(m[j], n[i-j], t0, t1, t2);
3491     }
3492     if ((i & 1) == 0) {
3493       MACC(a[j], a[j], t0, t1, t2);
3494     }
3495     for (; j < len; j++) {
3496       MACC(m[j], n[i-j], t0, t1, t2);
3497     }
3498     m[i-len] = t0;
3499     t0 = t1; t1 = t2; t2 = 0;
3500   }
3501 
3502   while (t0)
3503     t0 = sub(m, n, t0, len);
3504 }
3505 
3506 // Swap words in a longword.
3507 static julong swap(julong x) {
3508   return (x << 32) | (x >> 32);
3509 }
3510 
3511 // Copy len longwords from s to d, word-swapping as we go.  The
3512 // destination array is reversed.
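     // The net effect is that the jint sequence is reversed end to end: e.g. with len == 2
     // and jints {w0, w1, w2, w3} in memory, on little-endian x86 s[0] holds w1:w0 (high:low)
     // and s[1] holds w3:w2, so d[0] == w2:w3 and d[1] == w0:w1, i.e. jint order {w3, w2, w1, w0}.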
3513 static void reverse_words(julong *s, julong *d, int len) {
3514   d += len;
3515   while(len-- > 0) {
3516     d--;
3517     *d = swap(*s);
3518     s++;
3519   }
3520 }
3521 
3522 // The threshold at which squaring is advantageous was determined
3523 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3524 #define MONTGOMERY_SQUARING_THRESHOLD 64
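     // Note: the threshold is compared against len, which is in jints, so squaring kicks in
     // for operands of 64 * 32 == 2048 bits and above.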
3525 
3526 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3527                                         jint len, jlong inv,
3528                                         jint *m_ints) {
3529   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3530   int longwords = len/2;
3531 
3532   // Make very sure we don't use so much space that the stack might
3533   // overflow.  512 jints correspond to a 16384-bit integer and will
3534   // use a total of 8K bytes of stack space here.
3535   int total_allocation = longwords * sizeof (julong) * 4;
3536   guarantee(total_allocation <= 8192, "must be");
3537   julong *scratch = (julong *)alloca(total_allocation);
3538 
3539   // Local scratch arrays
3540   julong
3541     *a = scratch + 0 * longwords,
3542     *b = scratch + 1 * longwords,
3543     *n = scratch + 2 * longwords,
3544     *m = scratch + 3 * longwords;
3545 
3546   reverse_words((julong *)a_ints, a, longwords);
3547   reverse_words((julong *)b_ints, b, longwords);
3548   reverse_words((julong *)n_ints, n, longwords);
3549 
3550   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3551 
3552   reverse_words(m, (julong *)m_ints, longwords);
3553 }
3554 
3555 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3556                                       jint len, jlong inv,
3557                                       jint *m_ints) {
3558   assert(len % 2 == 0, "array length in montgomery_square must be even");
3559   int longwords = len/2;
3560 
3561   // Make very sure we don't use so much space that the stack might
3562   // overflow.  512 jints correspond to a 16384-bit integer and will
3563   // use a total of 6K bytes of stack space here.
3564   int total_allocation = longwords * sizeof (julong) * 3;
3565   guarantee(total_allocation <= 8192, "must be");
3566   julong *scratch = (julong *)alloca(total_allocation);
3567 
3568   // Local scratch arrays
3569   julong
3570     *a = scratch + 0 * longwords,
3571     *n = scratch + 1 * longwords,
3572     *m = scratch + 2 * longwords;
3573 
3574   reverse_words((julong *)a_ints, a, longwords);
3575   reverse_words((julong *)n_ints, n, longwords);
3576 
3577   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3578     ::montgomery_square(a, n, m, (julong)inv, longwords);
3579   } else {
3580     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3581   }
3582 
3583   reverse_words(m, (julong *)m_ints, longwords);
3584 }
3585 
3586 #ifdef COMPILER2
3587 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3588 //
3589 //------------------------------generate_exception_blob---------------------------
3590 // Creates the exception blob at the end.
3591 // Compiled methods jump to this code (the exception blob) when an exception is thrown
3592 // (see emit_exception_handler in the x86_64.ad file).
3593 //
3594 // Given an exception pc at a call, we call into the runtime for the
3595 // handler in this method. This handler might merely restore state
3596 // (i.e. callee-save registers), unwind the frame, and jump to the
3597 // exception handler for the nmethod if there is no Java-level handler
3598 // for the nmethod.
3599 //
3600 // This code is entered with a jmp.
3601 //
3602 // Arguments:
3603 //   rax: exception oop
3604 //   rdx: exception pc
3605 //
3606 // Results:
3607 //   rax: exception oop
3608 //   rdx: exception pc in caller or ???
3609 //   destination: exception handler of caller
3610 //
3611 // Note: the exception pc MUST be at a call (precise debug information)
3612 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3613 //
3614 
3615 void OptoRuntime::generate_exception_blob() {
3616   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3617   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3618   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3619 
3620   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3621 
3622   // Allocate space for the code
3623   ResourceMark rm;
3624   // Setup code generation tools
3625   CodeBuffer buffer("exception_blob", 2048, 1024);
3626   MacroAssembler* masm = new MacroAssembler(&buffer);
3627 
3628 
3629   address start = __ pc();
3630 
3631   // Exception pc is 'return address' for stack walker
3632   __ push(rdx);
3633   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3634 
3635   // Save callee-saved registers.  See x86_64.ad.
3636 
3637   // rbp is an implicitly saved callee saved register (i.e., the calling
3638   // convention will save/restore it in the prolog/epilog). Other than that
3639   // there are no callee save registers now that adapter frames are gone.
3640 
3641   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3642 
3643   // Store exception in Thread object. We cannot pass any arguments to the
3644   // handle_exception call, since we do not want to make any assumption
3645   // about the size of the frame where the exception happened in.
3646   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3647   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3648   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3649 
3650   // This call does all the hard work.  It checks if an exception handler
3651   // exists in the method.
3652   // If so, it returns the handler address.
3653   // If not, it prepares for stack-unwinding, restoring the callee-save
3654   // registers of the frame being removed.
3655   //
3656   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3657 
3658   // At a method handle call, the stack may not be properly aligned
3659   // when returning with an exception.
3660   address the_pc = __ pc();
3661   __ set_last_Java_frame(noreg, noreg, the_pc);
3662   __ mov(c_rarg0, r15_thread);
3663   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3664   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3665 
3666   // Set an oopmap for the call site.  This oopmap will only be used if we
3667   // are unwinding the stack.  Hence, all locations will be dead.
3668   // Callee-saved registers will be the same as the frame above (i.e.,
3669   // handle_exception_stub), since they were restored when we got the
3670   // exception.
3671 
3672   OopMapSet* oop_maps = new OopMapSet();
3673 
3674   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3675 
3676   __ reset_last_Java_frame(false);
3677 
3678   // Restore callee-saved registers
3679 
3680   // rbp is an implicitly saved callee-saved register (i.e., the calling
3681   // convention will save/restore it in the prolog/epilog). Other than that
3682   // there are no callee save registers now that adapter frames are gone.
3683 
3684   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3685 
3686   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3687   __ pop(rdx);                  // No need for exception pc anymore
3688 
3689   // rax: exception handler
3690 
3691   // We have a handler in rax (could be deopt blob).
3692   __ mov(r8, rax);
3693 
3694   // Get the exception oop
3695   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3696   // Get the exception pc in case we are deoptimized
3697   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3698 #ifdef ASSERT
3699   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
3700   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
3701 #endif
3702   // Clear the exception oop so GC no longer processes it as a root.
3703   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
3704 
3705   // rax: exception oop
3706   // r8:  exception handler
3707   // rdx: exception pc
3708   // Jump to handler
3709 
3710   __ jmp(r8);
3711 
3712   // Make sure all code is generated
3713   masm->flush();
3714 
3715   // Set exception blob
3716   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3717 }
3718 #endif // COMPILER2
3719 
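     // Thin wrapper around ComputeMoveOrder (shared code), which is presumed to fill
     // arg_order with an ordering of the in->out argument moves that avoids clobbering a
     // source before it has been read, using tmp_vmreg to break cycles.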
3720 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
3721                                        int total_in_args, const VMRegPair* in_regs,
3722                                        int total_out_args, VMRegPair* out_regs,
3723                                        GrowableArray<int>& arg_order,
3724                                        VMRegPair tmp_vmreg) {
3725   ComputeMoveOrder order(total_in_args, in_regs,
3726                          total_out_args, out_regs,
3727                          in_sig_bt, arg_order, tmp_vmreg);
3728 }
--- EOF ---