1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "classfile/symbolTable.hpp"
  32 #include "code/compiledIC.hpp"
  33 #include "code/debugInfoRec.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "oops/method.inline.hpp"
  47 #include "prims/methodHandles.hpp"
  48 #include "runtime/continuation.hpp"
  49 #include "runtime/continuationEntry.inline.hpp"
  50 #include "runtime/globals.hpp"
  51 #include "runtime/jniHandles.hpp"
  52 #include "runtime/safepointMechanism.hpp"
  53 #include "runtime/sharedRuntime.hpp"
  54 #include "runtime/signature.hpp"
  55 #include "runtime/stubRoutines.hpp"
  56 #include "runtime/vframeArray.hpp"
  57 #include "runtime/vm_version.hpp"
  58 #include "utilities/align.hpp"
  59 #include "utilities/checkedCast.hpp"
  60 #include "utilities/formatBuffer.hpp"
  61 #include "vmreg_x86.inline.hpp"
  62 #ifdef COMPILER1
  63 #include "c1/c1_Runtime1.hpp"
  64 #endif
  65 #ifdef COMPILER2
  66 #include "opto/runtime.hpp"
  67 #endif
  68 #if INCLUDE_JVMCI
  69 #include "jvmci/jvmciJavaClasses.hpp"
  70 #endif
  71 
  72 #define __ masm->
  73 
  74 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  75 
  76 class SimpleRuntimeFrame {
  77 
  78   public:
  79 
  80   // Most of the runtime stubs have this simple frame layout.
  81   // This class exists to make the layout shared in one place.
  82   // Offsets are for compiler stack slots, which are jints.
  83   enum layout {
  84     // The frame sender code expects that rbp will be in the "natural" place and
  85     // will override any oopMap setting for it. We must therefore force the layout
  86     // so that it agrees with the frame sender code.
  87     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  88     rbp_off2,
  89     return_off, return_off2,
  90     framesize
  91   };
  92 };
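// For illustration only: when frame::arg_reg_save_area_bytes is 0 (no register
// argument shadow area, the common case outside Windows), the enum above works
// out to
//   rbp_off    = 0, rbp_off2    = 1   // saved rbp      (2 jint slots = 1 word)
//   return_off = 2, return_off2 = 3   // return address (2 jint slots = 1 word)
//   framesize  = 4                    // 4 jint slots = 16 bytes = 2 words
// i.e. exactly the [saved rbp][return address] pair left by call + enter().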
  93 
  94 class RegisterSaver {
  95   // Capture info about frame layout.  Layout offsets are in jint
  96   // units because compiler frame slots are jints.
  97 #define XSAVE_AREA_BEGIN 160
  98 #define XSAVE_AREA_YMM_BEGIN 576
  99 #define XSAVE_AREA_OPMASK_BEGIN 1088
 100 #define XSAVE_AREA_ZMM_BEGIN 1152
 101 #define XSAVE_AREA_UPPERBANK 1664
 102 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 103 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 104 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 105 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 106 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 107   enum layout {
 108     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 109     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 110     DEF_XMM_OFFS(0),
 111     DEF_XMM_OFFS(1),
 112     // 2..15 are implied in range usage
 113     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 114     DEF_YMM_OFFS(0),
 115     DEF_YMM_OFFS(1),
 116     // 2..15 are implied in range usage
 117     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 118     DEF_OPMASK_OFFS(0),
 119     DEF_OPMASK_OFFS(1),
 120     // 2..7 are implied in range usage
 121     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 122     DEF_ZMM_OFFS(0),
 123     DEF_ZMM_OFFS(1),
 124     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 125     DEF_ZMM_UPPER_OFFS(16),
 126     DEF_ZMM_UPPER_OFFS(17),
 127     // 18..31 are implied in range usage
 128     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 129     fpu_stateH_end,
 130     r15_off, r15H_off,
 131     r14_off, r14H_off,
 132     r13_off, r13H_off,
 133     r12_off, r12H_off,
 134     r11_off, r11H_off,
 135     r10_off, r10H_off,
 136     r9_off,  r9H_off,
 137     r8_off,  r8H_off,
 138     rdi_off, rdiH_off,
 139     rsi_off, rsiH_off,
 140     ignore_off, ignoreH_off,  // extra copy of rbp
 141     rsp_off, rspH_off,
 142     rbx_off, rbxH_off,
 143     rdx_off, rdxH_off,
 144     rcx_off, rcxH_off,
 145     rax_off, raxH_off,
 146     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 147     align_off, alignH_off,
 148     flags_off, flagsH_off,
 149     // The frame sender code expects that rbp will be in the "natural" place and
 150     // will override any oopMap setting for it. We must therefore force the layout
 151     // so that it agrees with the frame sender code.
 152     rbp_off, rbpH_off,        // copy of rbp we will restore
 153     return_off, returnH_off,  // slot for return address
 154     reg_save_size             // size in compiler stack slots
 155   };
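  // Worked example (illustrative only), all offsets in jint (4-byte) slots:
  //   xmm_off   = fpu_state_off + 160/4 = fpu_state_off + 40
  //   xmm1_off  = xmm_off + 1*16/4      = xmm_off + 4     (an XMM covers 4 slots)
  //   ymm_off   = xmm_off + (576-160)/4 = xmm_off + 104
  //   zmm16_off = zmm_upper_off + 0,  zmm17_off = zmm_upper_off + 16
  //                                                       (a full ZMM covers 16 slots)
  //   opmask registers take 8 bytes = 2 slots each.
  // The GPRs, flags and the saved rbp/return address follow the xsave image,
  // for reg_save_size slots in total.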
 156 
 157  public:
 158   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 159   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 160 
 161   // Offsets into the register save area
 162   // Used by deoptimization when it is managing result register
 163   // values on its own
 164 
 165   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 166   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 167   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 168   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 169   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 170 
 171   // During deoptimization only the result registers need to be restored,
 172   // all the other values have already been extracted.
 173   static void restore_result_registers(MacroAssembler* masm);
 174 };
 175 
 176 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 177   int off = 0;
 178   int num_xmm_regs = XMMRegister::available_xmm_registers();
 179 #if COMPILER2_OR_JVMCI
 180   if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
 182   }
 183   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 184 #else
 185   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 186 #endif
 187 
  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 189   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 190   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 191   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 192   // CodeBlob frame size is in words.
 193   int frame_size_in_words = frame_size_in_bytes / wordSize;
 194   *total_frame_words = frame_size_in_words;
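  // Unit check (illustrative only): with BytesPerInt == 4 and wordSize == 8, a
  // 2048-byte save frame is 512 OopMap slots and 256 CodeBlob words; the same
  // region expressed in the units each consumer expects.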
 195 
 196   // Save registers, fpu state, and flags.
 197   // We assume caller has already pushed the return address onto the
 198   // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter() would leave it.
 201 
 202   __ enter();          // rsp becomes 16-byte aligned here
 203   __ push_CPU_state(); // Push a multiple of 16 bytes
 204 
  // push_CPU_state() handles this on EVEX enabled targets
 206   if (save_wide_vectors) {
 207     // Save upper half of YMM registers(0..15)
 208     int base_addr = XSAVE_AREA_YMM_BEGIN;
 209     for (int n = 0; n < 16; n++) {
 210       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 211     }
 212     if (VM_Version::supports_evex()) {
 213       // Save upper half of ZMM registers(0..15)
 214       base_addr = XSAVE_AREA_ZMM_BEGIN;
 215       for (int n = 0; n < 16; n++) {
 216         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 217       }
 218       // Save full ZMM registers(16..num_xmm_regs)
 219       base_addr = XSAVE_AREA_UPPERBANK;
 220       off = 0;
 221       int vector_len = Assembler::AVX_512bit;
 222       for (int n = 16; n < num_xmm_regs; n++) {
 223         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 224       }
 225 #if COMPILER2_OR_JVMCI
 226       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 227       off = 0;
 228       for(int n = 0; n < KRegister::number_of_registers; n++) {
 229         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 230       }
 231 #endif
 232     }
 233   } else {
 234     if (VM_Version::supports_evex()) {
 235       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 236       int base_addr = XSAVE_AREA_UPPERBANK;
 237       off = 0;
 238       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 239       for (int n = 16; n < num_xmm_regs; n++) {
 240         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 241       }
 242 #if COMPILER2_OR_JVMCI
 243       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 244       off = 0;
 245       for(int n = 0; n < KRegister::number_of_registers; n++) {
 246         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 247       }
 248 #endif
 249     }
 250   }
 251   __ vzeroupper();
 252   if (frame::arg_reg_save_area_bytes != 0) {
 253     // Allocate argument register save area
 254     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 255   }
 256 
 257   // Set an oopmap for the call site.  This oopmap will map all
 258   // oop-registers and debug-info registers as callee-saved.  This
 259   // will allow deoptimization at this safepoint to find all possible
 260   // debug-info recordings, as well as let GC find all oops.
 261 
 262   OopMapSet *oop_maps = new OopMapSet();
 263   OopMap* map = new OopMap(frame_size_in_slots, 0);
 264 
 265 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
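  // For illustration: STACK_OFFSET(rax_off) names the stack slot located
  // rax_off * VMRegImpl::stack_slot_size (4) bytes above this frame's rsp,
  // the same location rax_offset_in_bytes() computes for the restore path.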
 266 
 267   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 271   // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
 273   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 283   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 284   // on EVEX enabled targets, we get it included in the xsave area
 285   off = xmm0_off;
 286   int delta = xmm1_off - off;
 287   for (int n = 0; n < 16; n++) {
 288     XMMRegister xmm_name = as_XMMRegister(n);
 289     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 290     off += delta;
 291   }
 292   if (UseAVX > 2) {
 293     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 294     off = zmm16_off;
 295     delta = zmm17_off - off;
 296     for (int n = 16; n < num_xmm_regs; n++) {
 297       XMMRegister zmm_name = as_XMMRegister(n);
 298       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 299       off += delta;
 300     }
 301   }
 302 
 303 #if COMPILER2_OR_JVMCI
 304   if (save_wide_vectors) {
 305     // Save upper half of YMM registers(0..15)
 306     off = ymm0_off;
 307     delta = ymm1_off - ymm0_off;
 308     for (int n = 0; n < 16; n++) {
 309       XMMRegister ymm_name = as_XMMRegister(n);
 310       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 311       off += delta;
 312     }
 313     if (VM_Version::supports_evex()) {
 314       // Save upper half of ZMM registers(0..15)
 315       off = zmm0_off;
 316       delta = zmm1_off - zmm0_off;
 317       for (int n = 0; n < 16; n++) {
 318         XMMRegister zmm_name = as_XMMRegister(n);
 319         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 320         off += delta;
 321       }
 322     }
 323   }
 324 #endif // COMPILER2_OR_JVMCI
 325 
 326   // %%% These should all be a waste but we'll keep things as they were for now
 327   if (true) {
 328     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 332     // rbp location is known implicitly by the frame sender code, needs no oopmap
 333     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 343     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 344     // on EVEX enabled targets, we get it included in the xsave area
 345     off = xmm0H_off;
 346     delta = xmm1H_off - off;
 347     for (int n = 0; n < 16; n++) {
 348       XMMRegister xmm_name = as_XMMRegister(n);
 349       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 350       off += delta;
 351     }
 352     if (UseAVX > 2) {
 353       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 354       off = zmm16H_off;
 355       delta = zmm17H_off - off;
 356       for (int n = 16; n < num_xmm_regs; n++) {
 357         XMMRegister zmm_name = as_XMMRegister(n);
 358         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 359         off += delta;
 360       }
 361     }
 362   }
 363 
 364   return map;
 365 }
 366 
 367 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 368   int num_xmm_regs = XMMRegister::available_xmm_registers();
 369   if (frame::arg_reg_save_area_bytes != 0) {
 370     // Pop arg register save area
 371     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 372   }
 373 
 374 #if COMPILER2_OR_JVMCI
 375   if (restore_wide_vectors) {
 376     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 377     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 378   }
 379 #else
 380   assert(!restore_wide_vectors, "vectors are generated only by C2");
 381 #endif
 382 
 383   __ vzeroupper();
 384 
 385   // On EVEX enabled targets everything is handled in pop fpu state
 386   if (restore_wide_vectors) {
 387     // Restore upper half of YMM registers (0..15)
 388     int base_addr = XSAVE_AREA_YMM_BEGIN;
 389     for (int n = 0; n < 16; n++) {
 390       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 391     }
 392     if (VM_Version::supports_evex()) {
 393       // Restore upper half of ZMM registers (0..15)
 394       base_addr = XSAVE_AREA_ZMM_BEGIN;
 395       for (int n = 0; n < 16; n++) {
 396         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 397       }
 398       // Restore full ZMM registers(16..num_xmm_regs)
 399       base_addr = XSAVE_AREA_UPPERBANK;
 400       int vector_len = Assembler::AVX_512bit;
 401       int off = 0;
 402       for (int n = 16; n < num_xmm_regs; n++) {
 403         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 404       }
 405 #if COMPILER2_OR_JVMCI
 406       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 407       off = 0;
 408       for (int n = 0; n < KRegister::number_of_registers; n++) {
 409         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 410       }
 411 #endif
 412     }
 413   } else {
 414     if (VM_Version::supports_evex()) {
 415       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 416       int base_addr = XSAVE_AREA_UPPERBANK;
 417       int off = 0;
 418       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 419       for (int n = 16; n < num_xmm_regs; n++) {
 420         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 421       }
 422 #if COMPILER2_OR_JVMCI
 423       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 424       off = 0;
 425       for (int n = 0; n < KRegister::number_of_registers; n++) {
 426         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 427       }
 428 #endif
 429     }
 430   }
 431 
 432   // Recover CPU state
 433   __ pop_CPU_state();
 434   // Get the rbp described implicitly by the calling convention (no oopMap)
 435   __ pop(rbp);
 436 }
 437 
 438 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 439 
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee-save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.
 445 
 446   // Restore fp result register
 447   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 448   // Restore integer result register
 449   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 450   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 451 
  // Pop all of the register save area off the stack except the return address
 453   __ addptr(rsp, return_offset_in_bytes());
 454 }
 455 
// Is the vector's size (in bytes) bigger than the size saved by default?
// The 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 458 bool SharedRuntime::is_wide_vector(int size) {
 459   return size > 16;
 460 }
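// For example: is_wide_vector(16) is false (plain XMM state is already covered
// by fxsave/fxrstor), while is_wide_vector(32) and is_wide_vector(64) are true
// (YMM/ZMM state needs the explicit save/restore paths above).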
 461 
 462 // ---------------------------------------------------------------------------
 463 // Read the array of BasicTypes from a signature, and compute where the
 464 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 465 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 466 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 467 // as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot, 0(sp), and VMRegImpl::stack0+1
// refers to the memory word 4 bytes higher.  Register numbers up to
// Register::number_of_registers are the 64-bit integer registers.
 472 
 473 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 474 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 475 // units regardless of build. Of course for i486 there is no 64 bit build
 476 
 477 // The Java calling convention is a "shifted" version of the C ABI.
 478 // By skipping the first C ABI register we can call non-static jni methods
 479 // with small numbers of arguments without having to shuffle the arguments
 480 // at all. Since we control the java ABI we ought to at least get some
 481 // advantage out of it.
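// Illustrative example (register names only; the physical bindings of j_rarg*/
// j_farg* are defined by the port): for a static method with descriptor
// (Ljava/lang/Object;JFDI)V the incoming sig_bt is
//   T_OBJECT, T_LONG, T_VOID, T_FLOAT, T_DOUBLE, T_VOID, T_INT
// and the convention below assigns
//   T_OBJECT -> j_rarg0        T_LONG   -> j_rarg1  (the T_VOID half is BAD)
//   T_FLOAT  -> j_farg0        T_DOUBLE -> j_farg1  (the T_VOID half is BAD)
//   T_INT    -> j_rarg2
// with no outgoing stack slots needed, so the routine returns 0.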
 482 
 483 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 484                                            VMRegPair *regs,
 485                                            int total_args_passed) {
 486 
 487   // Create the mapping between argument positions and
 488   // registers.
 489   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 490     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 491   };
 492   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 493     j_farg0, j_farg1, j_farg2, j_farg3,
 494     j_farg4, j_farg5, j_farg6, j_farg7
 495   };
 496 
 497 
 498   uint int_args = 0;
 499   uint fp_args = 0;
 500   uint stk_args = 0;
 501 
 502   for (int i = 0; i < total_args_passed; i++) {
 503     switch (sig_bt[i]) {
 504     case T_BOOLEAN:
 505     case T_CHAR:
 506     case T_BYTE:
 507     case T_SHORT:
 508     case T_INT:
 509       if (int_args < Argument::n_int_register_parameters_j) {
 510         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 511       } else {
 512         stk_args = align_up(stk_args, 2);
 513         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 514         stk_args += 1;
 515       }
 516       break;
 517     case T_VOID:
 518       // halves of T_LONG or T_DOUBLE
 519       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 520       regs[i].set_bad();
 521       break;
 522     case T_LONG:
 523       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 524       // fall through
 525     case T_OBJECT:
 526     case T_ARRAY:
 527     case T_ADDRESS:
 528       if (int_args < Argument::n_int_register_parameters_j) {
 529         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 530       } else {
 531         stk_args = align_up(stk_args, 2);
 532         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 533         stk_args += 2;
 534       }
 535       break;
 536     case T_FLOAT:
 537       if (fp_args < Argument::n_float_register_parameters_j) {
 538         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 539       } else {
 540         stk_args = align_up(stk_args, 2);
 541         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 542         stk_args += 1;
 543       }
 544       break;
 545     case T_DOUBLE:
 546       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 547       if (fp_args < Argument::n_float_register_parameters_j) {
 548         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 549       } else {
 550         stk_args = align_up(stk_args, 2);
 551         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 552         stk_args += 2;
 553       }
 554       break;
 555     default:
 556       ShouldNotReachHere();
 557       break;
 558     }
 559   }
 560 
 561   return stk_args;
 562 }
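// Usage sketch (illustrative only, not part of this file's logic): a caller
// builds the BasicType array with T_VOID markers for the upper halves of longs
// and doubles and sizes the VMRegPair array to match, e.g. for (JLjava/lang/Object;)V:
//
//   BasicType sig_bt[] = { T_LONG, T_VOID, T_OBJECT };
//   VMRegPair regs[3];
//   int stack_slots = SharedRuntime::java_calling_convention(sig_bt, regs, 3);
//   // stack_slots is the number of 4-byte outgoing stack slots needed
//   // (0 here, since both arguments land in integer argument registers).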
 563 
 564 // Same as java_calling_convention() but for multiple return
 565 // values. There's no way to store them on the stack so if we don't
 566 // have enough registers, multiple values can't be returned.
 567 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 568 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
 569 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 570                                           VMRegPair *regs,
 571                                           int total_args_passed) {
 572   // Create the mapping between argument positions and
 573   // registers.
 574   static const Register INT_ArgReg[java_return_convention_max_int] = {
 575     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 576   };
 577   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 578     j_farg0, j_farg1, j_farg2, j_farg3,
 579     j_farg4, j_farg5, j_farg6, j_farg7
 580   };
 581 
 582 
 583   uint int_args = 0;
 584   uint fp_args = 0;
 585 
 586   for (int i = 0; i < total_args_passed; i++) {
 587     switch (sig_bt[i]) {
 588     case T_BOOLEAN:
 589     case T_CHAR:
 590     case T_BYTE:
 591     case T_SHORT:
 592     case T_INT:
 593       if (int_args < Argument::n_int_register_parameters_j+1) {
 594         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 595         int_args++;
 596       } else {
 597         return -1;
 598       }
 599       break;
 600     case T_VOID:
 601       // halves of T_LONG or T_DOUBLE
 602       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 603       regs[i].set_bad();
 604       break;
 605     case T_LONG:
 606       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 607       // fall through
 608     case T_OBJECT:
 609     case T_ARRAY:
 610     case T_ADDRESS:
 611     case T_METADATA:
 612       if (int_args < Argument::n_int_register_parameters_j+1) {
 613         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 614         int_args++;
 615       } else {
 616         return -1;
 617       }
 618       break;
 619     case T_FLOAT:
 620       if (fp_args < Argument::n_float_register_parameters_j) {
 621         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 622         fp_args++;
 623       } else {
 624         return -1;
 625       }
 626       break;
 627     case T_DOUBLE:
 628       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 629       if (fp_args < Argument::n_float_register_parameters_j) {
 630         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 631         fp_args++;
 632       } else {
 633         return -1;
 634       }
 635       break;
 636     default:
 637       ShouldNotReachHere();
 638       break;
 639     }
 640   }
 641 
 642   return int_args + fp_args;
 643 }
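// For illustration: a scalarized return of (int, Object, double), i.e.
// sig_bt = T_INT, T_OBJECT, T_DOUBLE, T_VOID, maps the int to INT_ArgReg[0] (rax),
// the Object to INT_ArgReg[1] and the double to FP_ArgReg[0], and the routine
// returns 3. If the values outnumber the available registers it returns -1 and
// the scalarized return convention cannot be used.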
 644 
 645 // Patch the callers callsite with entry to compiled code if it exists.
 646 static void patch_callers_callsite(MacroAssembler *masm) {
 647   Label L;
 648   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 649   __ jcc(Assembler::equal, L);
 650 
 651   // Save the current stack pointer
 652   __ mov(r13, rsp);
 653   // Schedule the branch target address early.
 654   // Call into the VM to patch the caller, then jump to compiled callee
 655   // rax isn't live so capture return address while we easily can
 656   __ movptr(rax, Address(rsp, 0));
 657 
 658   // align stack so push_CPU_state doesn't fault
 659   __ andptr(rsp, -(StackAlignmentInBytes));
 660   __ push_CPU_state();
 661   __ vzeroupper();
 662   // VM needs caller's callsite
 663   // VM needs target method
 664   // This needs to be a long call since we will relocate this adapter to
 665   // the codeBuffer and it may not reach
 666 
 667   // Allocate argument register save area
 668   if (frame::arg_reg_save_area_bytes != 0) {
 669     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 670   }
 671   __ mov(c_rarg0, rbx);
 672   __ mov(c_rarg1, rax);
 673   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 674 
 675   // De-allocate argument register save area
 676   if (frame::arg_reg_save_area_bytes != 0) {
 677     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 678   }
 679 
 680   __ vzeroupper();
 681   __ pop_CPU_state();
 682   // restore sp
 683   __ mov(rsp, r13);
 684   __ bind(L);
 685 }
 686 
 687 // For each inline type argument, sig includes the list of fields of
 688 // the inline type. This utility function computes the number of
 689 // arguments for the call if inline types are passed by reference (the
 690 // calling convention the interpreter expects).
 691 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 692   int total_args_passed = 0;
 693   if (InlineTypePassFieldsAsArgs) {
 694     for (int i = 0; i < sig_extended->length(); i++) {
 695       BasicType bt = sig_extended->at(i)._bt;
 696       if (bt == T_METADATA) {
 697         // In sig_extended, an inline type argument starts with:
 698         // T_METADATA, followed by the types of the fields of the
 699         // inline type and T_VOID to mark the end of the value
 700         // type. Inline types are flattened so, for instance, in the
 701         // case of an inline type with an int field and an inline type
 702         // field that itself has 2 fields, an int and a long:
 703         // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
 704         // slot for the T_LONG) T_VOID (inner inline type) T_VOID
 705         // (outer inline type)
 706         total_args_passed++;
 707         int vt = 1;
 708         do {
 709           i++;
 710           BasicType bt = sig_extended->at(i)._bt;
 711           BasicType prev_bt = sig_extended->at(i-1)._bt;
 712           if (bt == T_METADATA) {
 713             vt++;
 714           } else if (bt == T_VOID &&
 715                      prev_bt != T_LONG &&
 716                      prev_bt != T_DOUBLE) {
 717             vt--;
 718           }
 719         } while (vt != 0);
 720       } else {
 721         total_args_passed++;
 722       }
 723     }
 724   } else {
 725     total_args_passed = sig_extended->length();
 726   }
 727   return total_args_passed;
 728 }
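// Worked example (matching the comment above): for an outer inline type holding
// an int field plus a nested inline type with (int, long) fields, sig_extended is
//   T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID(long half) T_VOID(inner) T_VOID(outer)
// and counts as a single interpreter argument: vt goes 1 (outer T_METADATA)
// -> 2 (inner T_METADATA) -> unchanged across the T_VOID that follows T_LONG
// -> 1 (inner end marker) -> 0 (outer end marker).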
 729 
 730 
 731 static void gen_c2i_adapter_helper(MacroAssembler* masm,
 732                                    BasicType bt,
 733                                    BasicType prev_bt,
 734                                    size_t size_in_bytes,
 735                                    const VMRegPair& reg_pair,
 736                                    const Address& to,
 737                                    int extraspace,
 738                                    bool is_oop) {
 739   if (bt == T_VOID) {
 740     assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
 741     return;
 742   }
 743 
 744   // Say 4 args:
 745   // i   st_off
 746   // 0   32 T_LONG
 747   // 1   24 T_VOID
 748   // 2   16 T_OBJECT
 749   // 3    8 T_BOOL
 750   // -    0 return address
 751   //
  // However, to make things extra confusing: because we can fit a long/double in
  // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 756 
 757   bool wide = (size_in_bytes == wordSize);
 758   VMReg r_1 = reg_pair.first();
 759   VMReg r_2 = reg_pair.second();
 760   assert(r_2->is_valid() == wide, "invalid size");
 761   if (!r_1->is_valid()) {
 762     assert(!r_2->is_valid(), "must be invalid");
 763     return;
 764   }
 765 
 766   if (!r_1->is_XMMRegister()) {
 767     Register val = rax;
 768     if (r_1->is_stack()) {
 769       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 770       __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
 771     } else {
 772       val = r_1->as_Register();
 773     }
 774     assert_different_registers(to.base(), val, rscratch1);
 775     if (is_oop) {
 776       __ push(r13);
 777       __ push(rbx);
 778       __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
 779       __ pop(rbx);
 780       __ pop(r13);
 781     } else {
 782       __ store_sized_value(to, val, size_in_bytes);
 783     }
 784   } else {
 785     if (wide) {
 786       __ movdbl(to, r_1->as_XMMRegister());
 787     } else {
 788       __ movflt(to, r_1->as_XMMRegister());
 789     }
 790   }
 791 }
 792 
 793 static void gen_c2i_adapter(MacroAssembler *masm,
 794                             const GrowableArray<SigEntry>* sig_extended,
 795                             const VMRegPair *regs,
 796                             bool requires_clinit_barrier,
 797                             address& c2i_no_clinit_check_entry,
 798                             Label& skip_fixup,
 799                             address start,
 800                             OopMapSet* oop_maps,
 801                             int& frame_complete,
 802                             int& frame_size_in_words,
 803                             bool alloc_inline_receiver) {
 804   if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) {
 805     Label L_skip_barrier;
 806     Register method = rbx;
 807 
 808     { // Bypass the barrier for non-static methods
 809       Register flags = rscratch1;
 810       __ movl(flags, Address(method, Method::access_flags_offset()));
 811       __ testl(flags, JVM_ACC_STATIC);
 812       __ jcc(Assembler::zero, L_skip_barrier); // non-static
 813     }
 814 
 815     Register klass = rscratch1;
 816     __ load_method_holder(klass, method);
 817     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
 818 
 819     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
 820 
 821     __ bind(L_skip_barrier);
 822     c2i_no_clinit_check_entry = __ pc();
 823   }
 824 
 825   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 826   bs->c2i_entry_barrier(masm);
 827 
 828   // Before we get into the guts of the C2I adapter, see if we should be here
 829   // at all.  We've come from compiled code and are attempting to jump to the
 830   // interpreter, which means the caller made a static call to get here
 831   // (vcalls always get a compiled target if there is one).  Check for a
 832   // compiled target.  If there is one, we need to patch the caller's call.
 833   patch_callers_callsite(masm);
 834 
 835   __ bind(skip_fixup);
 836 
 837   if (InlineTypePassFieldsAsArgs) {
 838     // Is there an inline type argument?
 839     bool has_inline_argument = false;
 840     for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
 841       has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
 842     }
 843     if (has_inline_argument) {
 844       // There is at least an inline type argument: we're coming from
 845       // compiled code so we have no buffers to back the inline types.
 846       // Allocate the buffers here with a runtime call.
      OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
 848 
 849       frame_complete = __ offset();
 850 
 851       __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
 852 
 853       __ mov(c_rarg0, r15_thread);
 854       __ mov(c_rarg1, rbx);
 855       __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
 856       __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
 857 
 858       oop_maps->add_gc_map((int)(__ pc() - start), map);
 859       __ reset_last_Java_frame(false);
 860 
 861       RegisterSaver::restore_live_registers(masm);
 862 
 863       Label no_exception;
 864       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
 865       __ jcc(Assembler::equal, no_exception);
 866 
 867       __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
 868       __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 869       __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 870 
 871       __ bind(no_exception);
 872 
 873       // We get an array of objects from the runtime call
 874       __ get_vm_result(rscratch2, r15_thread); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
 875       __ get_vm_result_2(rbx, r15_thread); // TODO: required to keep the callee Method live?
 876     }
 877   }
 878 
 879   // Since all args are passed on the stack, total_args_passed *
 880   // Interpreter::stackElementSize is the space we need.
 881   int total_args_passed = compute_total_args_passed_int(sig_extended);
 882   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 883 
 884   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 885 
 886   // stack is aligned, keep it that way
 887   // This is not currently needed or enforced by the interpreter, but
 888   // we might as well conform to the ABI.
 889   extraspace = align_up(extraspace, 2*wordSize);
 890 
 891   // set senderSP value
 892   __ lea(r13, Address(rsp, wordSize));
 893 
 894 #ifdef ASSERT
 895   __ check_stack_alignment(r13, "sender stack not aligned");
 896 #endif
 897   if (extraspace > 0) {
 898     // Pop the return address
 899     __ pop(rax);
 900 
 901     __ subptr(rsp, extraspace);
 902 
 903     // Push the return address
 904     __ push(rax);
 905 
 906     // Account for the return address location since we store it first rather
 907     // than hold it in a register across all the shuffling
 908     extraspace += wordSize;
 909   }
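  // Worked example (illustrative only, assuming Interpreter::stackElementSize == 8):
  // for 3 interpreter arguments extraspace starts at 3 * 8 = 24 and is rounded up
  // to 32; after the pop/subptr/push dance above, the return-address slot sits
  // between the new rsp and the incoming compiled-code stack args, so extraspace
  // becomes 40 before it is used for the reg2stack() loads below.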
 910 
 911 #ifdef ASSERT
 912   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 913 #endif
 914 
 915   // Now write the args into the outgoing interpreter space
 916 
 917   // next_arg_comp is the next argument from the compiler point of
 918   // view (inline type fields are passed in registers/on the stack). In
 919   // sig_extended, an inline type argument starts with: T_METADATA,
 920   // followed by the types of the fields of the inline type and T_VOID
 921   // to mark the end of the inline type. ignored counts the number of
 922   // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
 923   // used to get the buffer for that argument from the pool of buffers
 924   // we allocated above and want to pass to the
 925   // interpreter. next_arg_int is the next argument from the
 926   // interpreter point of view (inline types are passed by reference).
 927   for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
 928        next_arg_comp < sig_extended->length(); next_arg_comp++) {
 929     assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
 930     assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
 931     BasicType bt = sig_extended->at(next_arg_comp)._bt;
 932     int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
 933     if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
 934       int next_off = st_off - Interpreter::stackElementSize;
 935       const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
 936       const VMRegPair reg_pair = regs[next_arg_comp-ignored];
 937       size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
 938       gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 939                              size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
 940       next_arg_int++;
 941 #ifdef ASSERT
 942       if (bt == T_LONG || bt == T_DOUBLE) {
 943         // Overwrite the unused slot with known junk
 944         __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 945         __ movptr(Address(rsp, st_off), rax);
 946       }
 947 #endif /* ASSERT */
 948     } else {
 949       ignored++;
 950       // get the buffer from the just allocated pool of buffers
 951       int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
 952       __ load_heap_oop(r14, Address(rscratch2, index));
 953       next_vt_arg++; next_arg_int++;
 954       int vt = 1;
 955       // write fields we get from compiled code in registers/stack
 956       // slots to the buffer: we know we are done with that inline type
 957       // argument when we hit the T_VOID that acts as an end of inline
 958       // type delimiter for this inline type. Inline types are flattened
 959       // so we might encounter embedded inline types. Each entry in
 960       // sig_extended contains a field offset in the buffer.
 961       Label L_null;
 962       do {
 963         next_arg_comp++;
 964         BasicType bt = sig_extended->at(next_arg_comp)._bt;
 965         BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
 966         if (bt == T_METADATA) {
 967           vt++;
 968           ignored++;
 969         } else if (bt == T_VOID &&
 970                    prev_bt != T_LONG &&
 971                    prev_bt != T_DOUBLE) {
 972           vt--;
 973           ignored++;
 974         } else {
 975           int off = sig_extended->at(next_arg_comp)._offset;
 976           if (off == -1) {
 977             // Nullable inline type argument, emit null check
 978             VMReg reg = regs[next_arg_comp-ignored].first();
 979             Label L_notNull;
 980             if (reg->is_stack()) {
 981               int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 982               __ testb(Address(rsp, ld_off), 1);
 983             } else {
 984               __ testb(reg->as_Register(), 1);
 985             }
 986             __ jcc(Assembler::notZero, L_notNull);
 987             __ movptr(Address(rsp, st_off), 0);
 988             __ jmp(L_null);
 989             __ bind(L_notNull);
 990             continue;
 991           }
 992           assert(off > 0, "offset in object should be positive");
 993           size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
 994           bool is_oop = is_reference_type(bt);
 995           gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 996                                  size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
 997         }
 998       } while (vt != 0);
 999       // pass the buffer to the interpreter
1000       __ movptr(Address(rsp, st_off), r14);
1001       __ bind(L_null);
1002     }
1003   }
1004 
1005   // Schedule the branch target address early.
1006   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
1007   __ jmp(rcx);
1008 }
1009 
1010 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
1011                         address code_start, address code_end,
1012                         Label& L_ok) {
1013   Label L_fail;
1014   __ lea(temp_reg, ExternalAddress(code_start));
1015   __ cmpptr(pc_reg, temp_reg);
1016   __ jcc(Assembler::belowEqual, L_fail);
1017   __ lea(temp_reg, ExternalAddress(code_end));
1018   __ cmpptr(pc_reg, temp_reg);
1019   __ jcc(Assembler::below, L_ok);
1020   __ bind(L_fail);
1021 }
1022 
1023 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
1024                                     int comp_args_on_stack,
1025                                     const GrowableArray<SigEntry>* sig,
1026                                     const VMRegPair *regs) {
1027 
1028   // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
1030   // code goes non-entrant while we get args ready.
1031   // In addition we use r13 to locate all the interpreter args as
1032   // we must align the stack to 16 bytes on an i2c entry else we
1033   // lose alignment we expect in all compiled code and register
1034   // save code can segv when fxsave instructions find improperly
1035   // aligned stack pointer.
1036 
1037   // Adapters can be frameless because they do not require the caller
1038   // to perform additional cleanup work, such as correcting the stack pointer.
1039   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
1040   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
1041   // even if a callee has modified the stack pointer.
1042   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
1043   // routinely repairs its caller's stack pointer (from sender_sp, which is set
1044   // up via the senderSP register).
1045   // In other words, if *either* the caller or callee is interpreted, we can
1046   // get the stack pointer repaired after a call.
1047   // This is why c2i and i2c adapters cannot be indefinitely composed.
1048   // In particular, if a c2i adapter were to somehow call an i2c adapter,
1049   // both caller and callee would be compiled methods, and neither would
1050   // clean up the stack pointer changes performed by the two adapters.
1051   // If this happens, control eventually transfers back to the compiled
1052   // caller, but with an uncorrected stack, causing delayed havoc.
1053 
1054   if (VerifyAdapterCalls &&
1055       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
1056     // So, let's test for cascading c2i/i2c adapters right now.
1057     //  assert(Interpreter::contains($return_addr) ||
1058     //         StubRoutines::contains($return_addr),
1059     //         "i2c adapter must return to an interpreter frame");
1060     __ block_comment("verify_i2c { ");
1061     // Pick up the return address
1062     __ movptr(rax, Address(rsp, 0));
1063     Label L_ok;
1064     if (Interpreter::code() != nullptr) {
1065       range_check(masm, rax, r11,
1066                   Interpreter::code()->code_start(),
1067                   Interpreter::code()->code_end(),
1068                   L_ok);
1069     }
1070     if (StubRoutines::initial_stubs_code() != nullptr) {
1071       range_check(masm, rax, r11,
1072                   StubRoutines::initial_stubs_code()->code_begin(),
1073                   StubRoutines::initial_stubs_code()->code_end(),
1074                   L_ok);
1075     }
1076     if (StubRoutines::final_stubs_code() != nullptr) {
1077       range_check(masm, rax, r11,
1078                   StubRoutines::final_stubs_code()->code_begin(),
1079                   StubRoutines::final_stubs_code()->code_end(),
1080                   L_ok);
1081     }
1082     const char* msg = "i2c adapter must return to an interpreter frame";
1083     __ block_comment(msg);
1084     __ stop(msg);
1085     __ bind(L_ok);
1086     __ block_comment("} verify_i2ce ");
1087   }
1088 
1089   // Must preserve original SP for loading incoming arguments because
1090   // we need to align the outgoing SP for compiled code.
1091   __ movptr(r11, rsp);
1092 
1093   // Pick up the return address
1094   __ pop(rax);
1095 
1096   // Convert 4-byte c2 stack slots to words.
1097   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
1098 
1099   if (comp_args_on_stack) {
1100     __ subptr(rsp, comp_words_on_stack * wordSize);
1101   }
1102 
1103   // Ensure compiled code always sees stack at proper alignment
1104   __ andptr(rsp, -16);
1105 
  // Push the return address, deliberately misaligning the stack so that the youngest
  // frame sees the same stack alignment it would immediately after a call instruction.
1108   __ push(rax);
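  // Worked example (illustrative only): with comp_args_on_stack == 3,
  // comp_words_on_stack = align_up(3 * 4, 8) >> 3 = 2, so rsp drops by 16 bytes,
  // is rounded down to a 16-byte boundary, and the pushed return address leaves
  // rsp at 16n - 8: exactly the alignment compiled code sees at the target of a
  // call instruction.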
1109 
1110   // Put saved SP in another register
1111   const Register saved_sp = rax;
1112   __ movptr(saved_sp, r11);
1113 
1114   // Will jump to the compiled code just as if compiled code was doing it.
1115   // Pre-load the register-jump target early, to schedule it better.
1116   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1117 
1118 #if INCLUDE_JVMCI
1119   if (EnableJVMCI) {
1120     // check if this call should be routed towards a specific entry point
1121     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1122     Label no_alternative_target;
1123     __ jcc(Assembler::equal, no_alternative_target);
1124     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1125     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1126     __ bind(no_alternative_target);
1127   }
1128 #endif // INCLUDE_JVMCI
1129 
1130   int total_args_passed = sig->length();
1131 
1132   // Now generate the shuffle code.  Pick up all register args and move the
1133   // rest through the floating point stack top.
1134   for (int i = 0; i < total_args_passed; i++) {
1135     BasicType bt = sig->at(i)._bt;
1136     if (bt == T_VOID) {
1137       // Longs and doubles are passed in native word order, but misaligned
1138       // in the 32-bit build.
1139       BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1140       assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1141       continue;
1142     }
1143 
1144     // Pick up 0, 1 or 2 words from SP+offset.
1145 
1146     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1147             "scrambled load targets?");
1148     // Load in argument order going down.
1149     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1150     // Point to interpreter value (vs. tag)
1151     int next_off = ld_off - Interpreter::stackElementSize;
1155     VMReg r_1 = regs[i].first();
1156     VMReg r_2 = regs[i].second();
1157     if (!r_1->is_valid()) {
1158       assert(!r_2->is_valid(), "");
1159       continue;
1160     }
1161     if (r_1->is_stack()) {
1162       // Convert stack slot to an SP offset (+ wordSize to account for return address )
1163       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1164 
      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and, if we end up going through a c2i because of a miss, a reasonable value of r13
      // will be generated.
1168       if (!r_2->is_valid()) {
1169         // sign extend???
1170         __ movl(r13, Address(saved_sp, ld_off));
1171         __ movptr(Address(rsp, st_off), r13);
1172       } else {
1173         //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
1177         //
1178         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
1179         // are accessed as negative so LSW is at LOW address
1180 
1181         // ld_off is MSW so get LSW
1182         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1183                            next_off : ld_off;
1184         __ movq(r13, Address(saved_sp, offset));
1185         // st_off is LSW (i.e. reg.first())
1186         __ movq(Address(rsp, st_off), r13);
1187       }
1188     } else if (r_1->is_Register()) {  // Register argument
1189       Register r = r_1->as_Register();
1190       assert(r != rax, "must be different");
1191       if (r_2->is_valid()) {
1192         //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
1196 
1197         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1198                            next_off : ld_off;
1199 
1200         // this can be a misaligned move
1201         __ movq(r, Address(saved_sp, offset));
1202       } else {
1203         // sign extend and use a full word?
1204         __ movl(r, Address(saved_sp, ld_off));
1205       }
1206     } else {
1207       if (!r_2->is_valid()) {
1208         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1209       } else {
1210         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1211       }
1212     }
1213   }
1214 
1215   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1216 
1217   // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race through here. If that
1219   // happens we don't want to take a safepoint because the
1220   // caller frame will look interpreted and arguments are now
1221   // "compiled" so it is much better to make this transition
1222   // invisible to the stack walking code. Unfortunately if
1223   // we try and find the callee by normal means a safepoint
1224   // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
1226 
1227   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1228 
1229   // put Method* where a c2i would expect should we end up there
1230   // only needed because of c2 resolve stubs return Method* as a result in
1231   // rax
1232   __ mov(rax, rbx);
1233   __ jmp(r11);
1234 }
1235 
1236 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1237   Register data = rax;
1238   __ ic_check(1 /* end_alignment */);
1239   __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1240 
  // The method might have been compiled since the call site was patched to
  // interpreted; if that is the case, treat it as a miss so we can get
  // the call site corrected.
1244   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1245   __ jcc(Assembler::equal, skip_fixup);
1246   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1247 }
1248 
1249 // ---------------------------------------------------------------
1250 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1251                                                             int comp_args_on_stack,
1252                                                             const GrowableArray<SigEntry>* sig,
1253                                                             const VMRegPair* regs,
1254                                                             const GrowableArray<SigEntry>* sig_cc,
1255                                                             const VMRegPair* regs_cc,
1256                                                             const GrowableArray<SigEntry>* sig_cc_ro,
1257                                                             const VMRegPair* regs_cc_ro,
1258                                                             AdapterFingerPrint* fingerprint,
1259                                                             AdapterBlob*& new_adapter,
1260                                                             bool allocate_code_blob) {
1261   address i2c_entry = __ pc();
1262   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1263 
1264   // -------------------------------------------------------------------------
1265   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1266   // to the interpreter.  The args start out packed in the compiled layout.  They
1267   // need to be unpacked into the interpreter layout.  This will almost always
1268   // require some stack space.  We grow the current (compiled) stack, then repack
1269   // the args.  We finally end in a jump to the generic interpreter entry point.
1270   // On exit from the interpreter, the interpreter will restore our SP (lest the
1271   // compiled code, which relies solely on SP and not RBP, get sick).
1272 
1273   address c2i_unverified_entry        = __ pc();
1274   address c2i_unverified_inline_entry = __ pc();
1275   Label skip_fixup;
1276 
1277   gen_inline_cache_check(masm, skip_fixup);
1278 
1279   OopMapSet* oop_maps = new OopMapSet();
1280   int frame_complete = CodeOffsets::frame_never_safe;
1281   int frame_size_in_words = 0;
1282 
1283   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1284   address c2i_no_clinit_check_entry = nullptr;
1285   address c2i_inline_ro_entry = __ pc();
1286   if (regs_cc != regs_cc_ro) {
1287     // No class init barrier needed because method is guaranteed to be non-static
1288     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, c2i_no_clinit_check_entry,
1289                     skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1290     skip_fixup.reset();
1291   }
1292 
1293   // Scalarized c2i adapter
1294   address c2i_entry        = __ pc();
1295   address c2i_inline_entry = __ pc();
1296   gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1297                   skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);
1298 
1299   // Non-scalarized c2i adapter
1300   if (regs != regs_cc) {
1301     c2i_unverified_inline_entry = __ pc();
1302     Label inline_entry_skip_fixup;
1303     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1304 
1305     c2i_inline_entry = __ pc();
1306     gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1307                     inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1308   }
1309 
1310   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1311   // the GC knows about the location of oop argument locations passed to the c2i adapter.
1312   if (allocate_code_blob) {
1313     bool caller_must_gc_arguments = (regs != regs_cc);
1314     new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1315   }
1316 
1317   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry, c2i_unverified_inline_entry, c2i_no_clinit_check_entry);
1318 }
1319 
1320 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1321                                          VMRegPair *regs,
1322                                          int total_args_passed) {
1323 
1324 // We return the number of VMRegImpl stack slots we need to reserve for all
1325 // the arguments NOT counting out_preserve_stack_slots.
1326 
1327 // NOTE: These arrays will have to change when c1 is ported
1328 #ifdef _WIN64
1329     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1330       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1331     };
1332     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1333       c_farg0, c_farg1, c_farg2, c_farg3
1334     };
1335 #else
1336     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1337       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1338     };
1339     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1340       c_farg0, c_farg1, c_farg2, c_farg3,
1341       c_farg4, c_farg5, c_farg6, c_farg7
1342     };
1343 #endif // _WIN64
1344 
1345 
1346     uint int_args = 0;
1347     uint fp_args = 0;
1348     uint stk_args = 0; // inc by 2 each time
1349 
1350     for (int i = 0; i < total_args_passed; i++) {
1351       switch (sig_bt[i]) {
1352       case T_BOOLEAN:
1353       case T_CHAR:
1354       case T_BYTE:
1355       case T_SHORT:
1356       case T_INT:
1357         if (int_args < Argument::n_int_register_parameters_c) {
1358           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1359 #ifdef _WIN64
1360           fp_args++;
1361           // Allocate slots for the callee to stuff register args on the stack.
1362           stk_args += 2;
1363 #endif
1364         } else {
1365           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1366           stk_args += 2;
1367         }
1368         break;
1369       case T_LONG:
1370         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1371         // fall through
1372       case T_OBJECT:
1373       case T_ARRAY:
1374       case T_ADDRESS:
1375       case T_METADATA:
1376         if (int_args < Argument::n_int_register_parameters_c) {
1377           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1378 #ifdef _WIN64
1379           fp_args++;
1380           stk_args += 2;
1381 #endif
1382         } else {
1383           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1384           stk_args += 2;
1385         }
1386         break;
1387       case T_FLOAT:
1388         if (fp_args < Argument::n_float_register_parameters_c) {
1389           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1390 #ifdef _WIN64
1391           int_args++;
1392           // Allocate slots for the callee to stuff register args on the stack.
1393           stk_args += 2;
1394 #endif
1395         } else {
1396           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1397           stk_args += 2;
1398         }
1399         break;
1400       case T_DOUBLE:
1401         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1402         if (fp_args < Argument::n_float_register_parameters_c) {
1403           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1404 #ifdef _WIN64
1405           int_args++;
1406           // Allocate slots for the callee to stuff register args on the stack.
1407           stk_args += 2;
1408 #endif
1409         } else {
1410           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1411           stk_args += 2;
1412         }
1413         break;
1414       case T_VOID: // Halves of longs and doubles
1415         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1416         regs[i].set_bad();
1417         break;
1418       default:
1419         ShouldNotReachHere();
1420         break;
1421       }
1422     }
1423 #ifdef _WIN64
1424   // The Windows ABI requires that we always allocate enough stack space
1425   // for 4 64-bit register arguments to be stored down (the "home"/shadow area).
1426   if (stk_args < 8) {
1427     stk_args = 8;
1428   }
1429 #endif // _WIN64
1430 
1431   return stk_args;
1432 }
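
// For illustration only, a rough sketch of what the function above computes: the
// value-carrying entries of an outgoing C signature
//   BasicType sig[] = { T_ADDRESS, T_OBJECT, T_LONG, T_VOID, T_DOUBLE, T_VOID };
//   VMRegPair regs[6];
//   int slots = SharedRuntime::c_calling_convention(sig, regs, 6);
// end up as:
//
//                Linux/SysV        Windows x64
//   C arg 0      c_rarg0 (rdi)     c_rarg0 (rcx)
//   C arg 1      c_rarg1 (rsi)     c_rarg1 (rdx)
//   C arg 2      c_rarg2 (rdx)     c_rarg2 (r8)
//   C arg 3      c_farg0 (xmm0)    c_farg3 (xmm3)
//
// On Linux everything fits in registers, so slots == 0. On Windows the int and
// float counters advance in lock-step and every register argument also reserves
// two 32-bit stack slots, so slots == 8, matching the mandatory 32-byte home
// ("shadow") area that the final check guarantees as a minimum.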
1433 
1434 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1435                                              uint num_bits,
1436                                              uint total_args_passed) {
1437   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1438          "only certain vector sizes are supported for now");
1439 
1440   static const XMMRegister VEC_ArgReg[32] = {
1441      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1442      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1443     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1444     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1445   };
1446 
1447   uint stk_args = 0;
1448   uint fp_args = 0;
1449 
1450   for (uint i = 0; i < total_args_passed; i++) {
1451     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1452     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1453     regs[i].set_pair(vmreg->next(next_val), vmreg);
1454   }
1455 
1456   return stk_args;
1457 }
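
// A rough note on the pairing above: VMReg slots are 32 bits wide, so a vector
// argument covers num_bits / 32 slots. For example, with num_bits == 256,
// next_val is 7 and regs[i] spans xmmN ... xmmN->next(7), i.e. 8 slots == 32
// bytes == one YMM register; with num_bits == 512 it spans 16 slots == one ZMM
// register. Nothing is ever placed on the stack here, so stk_args stays 0.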
1458 
1459 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1460   // We always ignore the frame_slots arg and just use the space just below the
1461   // frame pointer, which by this time is free to use.
1462   switch (ret_type) {
1463   case T_FLOAT:
1464     __ movflt(Address(rbp, -wordSize), xmm0);
1465     break;
1466   case T_DOUBLE:
1467     __ movdbl(Address(rbp, -wordSize), xmm0);
1468     break;
1469   case T_VOID:  break;
1470   default: {
1471     __ movptr(Address(rbp, -wordSize), rax);
1472     }
1473   }
1474 }
1475 
1476 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1477   // We always ignore the frame_slots arg and just use the space just below the
1478   // frame pointer, which by this time is free to use.
1479   switch (ret_type) {
1480   case T_FLOAT:
1481     __ movflt(xmm0, Address(rbp, -wordSize));
1482     break;
1483   case T_DOUBLE:
1484     __ movdbl(xmm0, Address(rbp, -wordSize));
1485     break;
1486   case T_VOID:  break;
1487   default: {
1488     __ movptr(rax, Address(rbp, -wordSize));
1489     }
1490   }
1491 }
1492 
1493 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1494     for ( int i = first_arg ; i < arg_count ; i++ ) {
1495       if (args[i].first()->is_Register()) {
1496         __ push(args[i].first()->as_Register());
1497       } else if (args[i].first()->is_XMMRegister()) {
1498         __ subptr(rsp, 2*wordSize);
1499         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1500       }
1501     }
1502 }
1503 
1504 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1505     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1506       if (args[i].first()->is_Register()) {
1507         __ pop(args[i].first()->as_Register());
1508       } else if (args[i].first()->is_XMMRegister()) {
1509         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1510         __ addptr(rsp, 2*wordSize);
1511       }
1512     }
1513 }
1514 
1515 static void verify_oop_args(MacroAssembler* masm,
1516                             const methodHandle& method,
1517                             const BasicType* sig_bt,
1518                             const VMRegPair* regs) {
1519   Register temp_reg = rbx;  // not part of any compiled calling seq
1520   if (VerifyOops) {
1521     for (int i = 0; i < method->size_of_parameters(); i++) {
1522       if (is_reference_type(sig_bt[i])) {
1523         VMReg r = regs[i].first();
1524         assert(r->is_valid(), "bad oop arg");
1525         if (r->is_stack()) {
1526           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1527           __ verify_oop(temp_reg);
1528         } else {
1529           __ verify_oop(r->as_Register());
1530         }
1531       }
1532     }
1533   }
1534 }
1535 
1536 static void check_continuation_enter_argument(VMReg actual_vmreg,
1537                                               Register expected_reg,
1538                                               const char* name) {
1539   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1540   assert(actual_vmreg->as_Register() == expected_reg,
1541          "%s is in unexpected register: %s instead of %s",
1542          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1543 }
1544 
1545 
1546 //---------------------------- continuation_enter_setup ---------------------------
1547 //
1548 // Arguments:
1549 //   None.
1550 //
1551 // Results:
1552 //   rsp: pointer to blank ContinuationEntry
1553 //
1554 // Kills:
1555 //   rax
1556 //
1557 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1558   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1559   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1560   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1561 
1562   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1563   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1564 
1565   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1566   OopMap* map = new OopMap(frame_size, 0);
1567 
1568   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1569   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1570   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1571 
1572   return map;
1573 }
1574 
1575 //---------------------------- fill_continuation_entry ---------------------------
1576 //
1577 // Arguments:
1578 //   rsp: pointer to blank Continuation entry
1579 //   reg_cont_obj: pointer to the continuation
1580 //   reg_flags: flags
1581 //
1582 // Results:
1583 //   rsp: pointer to filled out ContinuationEntry
1584 //
1585 // Kills:
1586 //   rax
1587 //
1588 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1589   assert_different_registers(rax, reg_cont_obj, reg_flags);
1590 #ifdef ASSERT
1591   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1592 #endif
1593   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1594   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1595   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1596   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1597   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1598 
1599   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1600   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1601   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1602   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1603 
1604   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1605   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1606 }
1607 
1608 //---------------------------- continuation_enter_cleanup ---------------------------
1609 //
1610 // Arguments:
1611 //   rsp: pointer to the ContinuationEntry
1612 //
1613 // Results:
1614 //   rsp: pointer to the spilled rbp in the entry frame
1615 //
1616 // Kills:
1617 //   rbx
1618 //
1619 static void continuation_enter_cleanup(MacroAssembler* masm) {
1620 #ifdef ASSERT
1621   Label L_good_sp;
1622   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1623   __ jcc(Assembler::equal, L_good_sp);
1624   __ stop("Incorrect rsp at continuation_enter_cleanup");
1625   __ bind(L_good_sp);
1626 #endif
1627 
1628   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1629   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1630   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1631   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1632 
1633   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1634   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1635   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1636 }
1637 
1638 static void gen_continuation_enter(MacroAssembler* masm,
1639                                    const VMRegPair* regs,
1640                                    int& exception_offset,
1641                                    OopMapSet* oop_maps,
1642                                    int& frame_complete,
1643                                    int& stack_slots,
1644                                    int& interpreted_entry_offset,
1645                                    int& compiled_entry_offset) {
1646 
1647   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1648   int pos_cont_obj   = 0;
1649   int pos_is_cont    = 1;
1650   int pos_is_virtual = 2;
1651 
1652   // The platform-specific calling convention may present the arguments in various registers.
1653   // To simplify the rest of the code, we expect the arguments to reside in these known
1654   // registers, and we additionally check the placement here in case the calling convention ever
1655   // changes.
1656   Register reg_cont_obj   = c_rarg1;
1657   Register reg_is_cont    = c_rarg2;
1658   Register reg_is_virtual = c_rarg3;
1659 
1660   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1661   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1662   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1663 
1664   // Utility methods kill rax, make sure there are no collisions
1665   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1666 
1667   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1668                          relocInfo::static_call_type);
1669 
1670   address start = __ pc();
1671 
1672   Label L_thaw, L_exit;
1673 
1674   // i2i entry used at interp_only_mode only
1675   interpreted_entry_offset = __ pc() - start;
1676   {
1677 #ifdef ASSERT
1678     Label is_interp_only;
1679     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1680     __ jcc(Assembler::notEqual, is_interp_only);
1681     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1682     __ bind(is_interp_only);
1683 #endif
1684 
1685     __ pop(rax); // return address
1686     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1687     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1688     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1689     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1690     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1691     __ push(rax); // return address
1692     __ push_cont_fastpath();
1693 
1694     __ enter();
1695 
1696     stack_slots = 2; // will be adjusted in setup
1697     OopMap* map = continuation_enter_setup(masm, stack_slots);
1698     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1699     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1700 
1701     __ verify_oop(reg_cont_obj);
1702 
1703     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1704 
1705     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1706     __ testptr(reg_is_cont, reg_is_cont);
1707     __ jcc(Assembler::notZero, L_thaw);
1708 
1709     // --- Resolve path
1710 
1711     // Make sure the call is patchable
1712     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1713     // Emit stub for static call
1714     CodeBuffer* cbuf = masm->code_section()->outer();
1715     address stub = CompiledDirectCall::emit_to_interp_stub(*cbuf, __ pc());
1716     if (stub == nullptr) {
1717       fatal("CodeCache is full at gen_continuation_enter");
1718     }
1719     __ call(resolve);
1720     oop_maps->add_gc_map(__ pc() - start, map);
1721     __ post_call_nop();
1722 
1723     __ jmp(L_exit);
1724   }
1725 
1726   // compiled entry
1727   __ align(CodeEntryAlignment);
1728   compiled_entry_offset = __ pc() - start;
1729   __ enter();
1730 
1731   stack_slots = 2; // will be adjusted in setup
1732   OopMap* map = continuation_enter_setup(masm, stack_slots);
1733 
1734   // Frame is now completed as far as size and linkage.
1735   frame_complete = __ pc() - start;
1736 
1737   __ verify_oop(reg_cont_obj);
1738 
1739   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1740 
1741   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1742   __ testptr(reg_is_cont, reg_is_cont);
1743   __ jccb(Assembler::notZero, L_thaw);
1744 
1745   // --- call Continuation.enter(Continuation c, boolean isContinue)
1746 
1747   // Make sure the call is patchable
1748   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1749 
1750   // Emit stub for static call
1751   CodeBuffer* cbuf = masm->code_section()->outer();
1752   address stub = CompiledDirectCall::emit_to_interp_stub(*cbuf, __ pc());
1753   if (stub == nullptr) {
1754     fatal("CodeCache is full at gen_continuation_enter");
1755   }
1756 
1757   // The call needs to be resolved. There's a special case for this in
1758   // SharedRuntime::find_callee_info_helper() which calls
1759   // LinkResolver::resolve_continuation_enter() which resolves the call to
1760   // Continuation.enter(Continuation c, boolean isContinue).
1761   __ call(resolve);
1762 
1763   oop_maps->add_gc_map(__ pc() - start, map);
1764   __ post_call_nop();
1765 
1766   __ jmpb(L_exit);
1767 
1768   // --- Thawing path
1769 
1770   __ bind(L_thaw);
1771 
1772   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1773 
1774   ContinuationEntry::_return_pc_offset = __ pc() - start;
1775   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1776   __ post_call_nop();
1777 
1778   // --- Normal exit (resolve/thawing)
1779 
1780   __ bind(L_exit);
1781 
1782   continuation_enter_cleanup(masm);
1783   __ pop(rbp);
1784   __ ret(0);
1785 
1786   // --- Exception handling path
1787 
1788   exception_offset = __ pc() - start;
1789 
1790   continuation_enter_cleanup(masm);
1791   __ pop(rbp);
1792 
1793   __ movptr(c_rarg0, r15_thread);
1794   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1795 
1796   // rax still holds the original exception oop, save it before the call
1797   __ push(rax);
1798 
1799   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1800   __ movptr(rbx, rax);
1801 
1802   // Continue at exception handler:
1803   //   rax: exception oop
1804   //   rbx: exception handler
1805   //   rdx: exception pc
1806   __ pop(rax);
1807   __ verify_oop(rax);
1808   __ pop(rdx);
1809   __ jmp(rbx);
1810 }
1811 
1812 static void gen_continuation_yield(MacroAssembler* masm,
1813                                    const VMRegPair* regs,
1814                                    OopMapSet* oop_maps,
1815                                    int& frame_complete,
1816                                    int& stack_slots,
1817                                    int& compiled_entry_offset) {
1818   enum layout {
1819     rbp_off,
1820     rbpH_off,
1821     return_off,
1822     return_off2,
1823     framesize // inclusive of return address
1824   };
1825   stack_slots = framesize /  VMRegImpl::slots_per_word;
1826   assert(stack_slots == 2, "recheck layout");
1827 
1828   address start = __ pc();
1829   compiled_entry_offset = __ pc() - start;
1830   __ enter();
1831   address the_pc = __ pc();
1832 
1833   frame_complete = the_pc - start;
1834 
1835   // This nop must be exactly at the PC we push into the frame info.
1836   // We use this nop for fast CodeBlob lookup; associate the OopMap
1837   // with it right away.
1838   __ post_call_nop();
1839   OopMap* map = new OopMap(framesize, 1);
1840   oop_maps->add_gc_map(frame_complete, map);
1841 
1842   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1843   __ movptr(c_rarg0, r15_thread);
1844   __ movptr(c_rarg1, rsp);
1845   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1846   __ reset_last_Java_frame(true);
1847 
1848   Label L_pinned;
1849 
1850   __ testptr(rax, rax);
1851   __ jcc(Assembler::notZero, L_pinned);
1852 
1853   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1854   continuation_enter_cleanup(masm);
1855   __ pop(rbp);
1856   __ ret(0);
1857 
1858   __ bind(L_pinned);
1859 
1860   // Pinned, return to caller
1861 
1862   // handle pending exception thrown by freeze
1863   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1864   Label ok;
1865   __ jcc(Assembler::equal, ok);
1866   __ leave();
1867   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1868   __ bind(ok);
1869 
1870   __ leave();
1871   __ ret(0);
1872 }
1873 
1874 static void gen_special_dispatch(MacroAssembler* masm,
1875                                  const methodHandle& method,
1876                                  const BasicType* sig_bt,
1877                                  const VMRegPair* regs) {
1878   verify_oop_args(masm, method, sig_bt, regs);
1879   vmIntrinsics::ID iid = method->intrinsic_id();
1880 
1881   // Now write the args into the outgoing interpreter space
1882   bool     has_receiver   = false;
1883   Register receiver_reg   = noreg;
1884   int      member_arg_pos = -1;
1885   Register member_reg     = noreg;
1886   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1887   if (ref_kind != 0) {
1888     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1889     member_reg = rbx;  // known to be free at this point
1890     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1891   } else if (iid == vmIntrinsics::_invokeBasic) {
1892     has_receiver = true;
1893   } else if (iid == vmIntrinsics::_linkToNative) {
1894     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1895     member_reg = rbx;  // known to be free at this point
1896   } else {
1897     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1898   }
1899 
1900   if (member_reg != noreg) {
1901     // Load the member_arg into register, if necessary.
1902     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1903     VMReg r = regs[member_arg_pos].first();
1904     if (r->is_stack()) {
1905       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1906     } else {
1907       // no data motion is needed
1908       member_reg = r->as_Register();
1909     }
1910   }
1911 
1912   if (has_receiver) {
1913     // Make sure the receiver is loaded into a register.
1914     assert(method->size_of_parameters() > 0, "oob");
1915     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1916     VMReg r = regs[0].first();
1917     assert(r->is_valid(), "bad receiver arg");
1918     if (r->is_stack()) {
1919       // Porting note:  This assumes that compiled calling conventions always
1920       // pass the receiver oop in a register.  If this is not true on some
1921       // platform, pick a temp and load the receiver from stack.
1922       fatal("receiver always in a register");
1923       receiver_reg = j_rarg0;  // known to be free at this point
1924       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1925     } else {
1926       // no data motion is needed
1927       receiver_reg = r->as_Register();
1928     }
1929   }
1930 
1931   // Figure out which address we are really jumping to:
1932   MethodHandles::generate_method_handle_dispatch(masm, iid,
1933                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1934 }
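
// For illustration, how the dispatch above plays out for two common cases:
// MethodHandle.linkToVirtual has a non-zero ref_kind with a receiver, so the
// trailing MemberName argument is loaded into rbx (member_reg) and the receiver
// is taken from regs[0] (expected to already be in a register, typically
// j_rarg0); invokeBasic has no MemberName, so only the receiver is picked up.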
1935 
1936 // ---------------------------------------------------------------------------
1937 // Generate a native wrapper for a given method.  The method takes arguments
1938 // in the Java compiled code convention, marshals them to the native
1939 // convention (handlizes oops, etc), transitions to native, makes the call,
1940 // returns to java state (possibly blocking), unhandlizes any result and
1941 // returns.
1942 //
1943 // Critical native functions are a shorthand for the use of
1944 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1945 // functions.  The wrapper is expected to unpack the arguments before
1946 // passing them to the callee. Critical native functions leave the state _in_Java,
1947 // since they cannot stop for GC.
1948 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1949 // block and the check for pending exceptions, since it's impossible for them
1950 // to be thrown.
1951 //
1952 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1953                                                 const methodHandle& method,
1954                                                 int compile_id,
1955                                                 BasicType* in_sig_bt,
1956                                                 VMRegPair* in_regs,
1957                                                 BasicType ret_type) {
1958   if (method->is_continuation_native_intrinsic()) {
1959     int exception_offset = -1;
1960     OopMapSet* oop_maps = new OopMapSet();
1961     int frame_complete = -1;
1962     int stack_slots = -1;
1963     int interpreted_entry_offset = -1;
1964     int vep_offset = -1;
1965     if (method->is_continuation_enter_intrinsic()) {
1966       gen_continuation_enter(masm,
1967                              in_regs,
1968                              exception_offset,
1969                              oop_maps,
1970                              frame_complete,
1971                              stack_slots,
1972                              interpreted_entry_offset,
1973                              vep_offset);
1974     } else if (method->is_continuation_yield_intrinsic()) {
1975       gen_continuation_yield(masm,
1976                              in_regs,
1977                              oop_maps,
1978                              frame_complete,
1979                              stack_slots,
1980                              vep_offset);
1981     } else {
1982       guarantee(false, "Unknown Continuation native intrinsic");
1983     }
1984 
1985 #ifdef ASSERT
1986     if (method->is_continuation_enter_intrinsic()) {
1987       assert(interpreted_entry_offset != -1, "Must be set");
1988       assert(exception_offset != -1,         "Must be set");
1989     } else {
1990       assert(interpreted_entry_offset == -1, "Must be unset");
1991       assert(exception_offset == -1,         "Must be unset");
1992     }
1993     assert(frame_complete != -1,    "Must be set");
1994     assert(stack_slots != -1,       "Must be set");
1995     assert(vep_offset != -1,        "Must be set");
1996 #endif
1997 
1998     __ flush();
1999     nmethod* nm = nmethod::new_native_nmethod(method,
2000                                               compile_id,
2001                                               masm->code(),
2002                                               vep_offset,
2003                                               frame_complete,
2004                                               stack_slots,
2005                                               in_ByteSize(-1),
2006                                               in_ByteSize(-1),
2007                                               oop_maps,
2008                                               exception_offset);
2009     if (nm == nullptr) return nm;
2010     if (method->is_continuation_enter_intrinsic()) {
2011       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2012     } else if (method->is_continuation_yield_intrinsic()) {
2013       _cont_doYield_stub = nm;
2014     }
2015     return nm;
2016   }
2017 
2018   if (method->is_method_handle_intrinsic()) {
2019     vmIntrinsics::ID iid = method->intrinsic_id();
2020     intptr_t start = (intptr_t)__ pc();
2021     int vep_offset = ((intptr_t)__ pc()) - start;
2022     gen_special_dispatch(masm,
2023                          method,
2024                          in_sig_bt,
2025                          in_regs);
2026     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2027     __ flush();
2028     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2029     return nmethod::new_native_nmethod(method,
2030                                        compile_id,
2031                                        masm->code(),
2032                                        vep_offset,
2033                                        frame_complete,
2034                                        stack_slots / VMRegImpl::slots_per_word,
2035                                        in_ByteSize(-1),
2036                                        in_ByteSize(-1),
2037                                        nullptr);
2038   }
2039   address native_func = method->native_function();
2040   assert(native_func != nullptr, "must have function");
2041 
2042   // An OopMap for lock (and class if static)
2043   OopMapSet *oop_maps = new OopMapSet();
2044   intptr_t start = (intptr_t)__ pc();
2045 
2046   // We have received a description of where all the java args are located
2047   // on entry to the wrapper. We need to convert these args to where
2048   // the jni function will expect them. To figure out where they go
2049   // we convert the java signature to a C signature by inserting
2050   // the hidden arguments as arg[0] and possibly arg[1] (for a static method).
2051 
2052   const int total_in_args = method->size_of_parameters();
2053   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2054 
2055   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2056   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2057   BasicType* in_elem_bt = nullptr;
2058 
2059   int argc = 0;
2060   out_sig_bt[argc++] = T_ADDRESS;
2061   if (method->is_static()) {
2062     out_sig_bt[argc++] = T_OBJECT;
2063   }
2064 
2065   for (int i = 0; i < total_in_args ; i++ ) {
2066     out_sig_bt[argc++] = in_sig_bt[i];
2067   }
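
  // For example (just to illustrate the conversion above): a static native
  //   int foo(String s)
  // has total_in_args == 1 and total_c_args == 3, and out_sig_bt becomes
  //   { T_ADDRESS /* JNIEnv* */, T_OBJECT /* class mirror */, T_OBJECT /* s */ }.
  // For a non-static method the receiver is already part of in_sig_bt, so only
  // the leading JNIEnv* entry is inserted.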
2068 
2069   // Now figure out where the args must be stored and how much stack space
2070   // they require.
2071   int out_arg_slots;
2072   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2073 
2074   // Compute framesize for the wrapper.  We need to handlize all oops in
2075   // incoming registers
2076 
2077   // Calculate the total number of stack slots we will need.
2078 
2079   // First count the abi requirement plus all of the outgoing args
2080   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2081 
2082   // Now the space for the inbound oop handle area
2083   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
2084 
2085   int oop_handle_offset = stack_slots;
2086   stack_slots += total_save_slots;
2087 
2088   // Now any space we need for handlizing a klass if static method
2089 
2090   int klass_slot_offset = 0;
2091   int klass_offset = -1;
2092   int lock_slot_offset = 0;
2093   bool is_static = false;
2094 
2095   if (method->is_static()) {
2096     klass_slot_offset = stack_slots;
2097     stack_slots += VMRegImpl::slots_per_word;
2098     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2099     is_static = true;
2100   }
2101 
2102   // Plus a lock if needed
2103 
2104   if (method->is_synchronized()) {
2105     lock_slot_offset = stack_slots;
2106     stack_slots += VMRegImpl::slots_per_word;
2107   }
2108 
2109   // Now a place (+2 slots) to save return values or temps during shuffling,
2110   // plus 4 slots for the return address (which we own) and the saved rbp
2111   stack_slots += 6;
2112 
2113   // Ok The space we have allocated will look like:
2114   //
2115   //
2116   // FP-> |                     |
2117   //      |---------------------|
2118   //      | 2 slots for moves   |
2119   //      |---------------------|
2120   //      | lock box (if sync)  |
2121   //      |---------------------| <- lock_slot_offset
2122   //      | klass (if static)   |
2123   //      |---------------------| <- klass_slot_offset
2124   //      | oopHandle area      |
2125   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2126   //      | outbound memory     |
2127   //      | based arguments     |
2128   //      |                     |
2129   //      |---------------------|
2130   //      |                     |
2131   // SP-> | out_preserved_slots |
2132   //
2133   //
2134 
2135 
2136   // Now compute the actual number of stack slots we need, rounding up to keep
2137   // the stack properly aligned.
2138   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2139 
2140   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2141 
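  // As a rough worked example (assuming out_preserve_stack_slots() is 0 here and
  // all C arguments fit in registers, so out_arg_slots == 0 on Linux): a
  // non-static, unsynchronized method needs 0 + 12 (oop handle area) + 6
  // (2 move-spill slots + return address + saved rbp) = 18 slots, aligned up to
  // 20 slots, i.e. a stack_size of 80 bytes.
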
2142   // First thing, make an IC check to see if we should even be here.
2143 
2144   // We are free to use all registers as temps without saving them and
2145   // restoring them except rbp. rbp is the only callee save register
2146   // as far as the interpreter and the compiler(s) are concerned.
2147 
2148   const Register receiver = j_rarg0;
2149 
2150   Label exception_pending;
2151 
2152   assert_different_registers(receiver, rscratch1, rscratch2);
2153   __ verify_oop(receiver);
2154   __ ic_check(8 /* end_alignment */);
2155 
2156   int vep_offset = ((intptr_t)__ pc()) - start;
2157 
2158   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2159     Label L_skip_barrier;
2160     Register klass = r10;
2161     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2162     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2163 
2164     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2165 
2166     __ bind(L_skip_barrier);
2167   }
2168 
2169 #ifdef COMPILER1
2170   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2171   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2172     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2173   }
2174 #endif // COMPILER1
2175 
2176   // The instruction at the verified entry point must be 5 bytes or longer
2177   // because it can be patched on the fly by make_non_entrant. The stack bang
2178   // instruction fits that requirement.
2179 
2180   // Generate stack overflow check
2181   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2182 
2183   // Generate a new frame for the wrapper.
2184   __ enter();
2185   // -2 because return address is already present and so is saved rbp
2186   __ subptr(rsp, stack_size - 2*wordSize);
2187 
2188   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2189   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2190   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2191 
2192   // Frame is now completed as far as size and linkage.
2193   int frame_complete = ((intptr_t)__ pc()) - start;
2194 
2195   if (UseRTMLocking) {
2196     // Abort RTM transaction before calling JNI
2197     // because critical section will be large and will be
2198     // aborted anyway. Also nmethod could be deoptimized.
2199     __ xabort(0);
2200   }
2201 
2202 #ifdef ASSERT
2203   __ check_stack_alignment(rsp, "improperly aligned stack");
2204 #endif /* ASSERT */
2205 
2206 
2207   // We use r14 as the oop handle for the receiver/klass
2208   // It is callee save so it survives the call to native
2209 
2210   const Register oop_handle_reg = r14;
2211 
2212   //
2213   // We immediately shuffle the arguments so that for any VM call we have to
2214   // make from here on out (sync slow path, jvmti, etc.) we will have
2215   // captured the oops from our caller and have a valid oopMap for
2216   // them.
2217 
2218   // -----------------
2219   // The Grand Shuffle
2220 
2221   // The Java calling convention is either equal to (linux) or denser than (win64) the
2222   // C calling convention. However, because of the jni_env argument, the C calling
2223   // convention always has at least one more (and two for static) arguments than Java.
2224   // Therefore if we move the args from java -> c backwards then we will never have
2225   // a register->register conflict and we don't have to build a dependency graph
2226   // and figure out how to break any cycles.
2227   //
2228 
2229   // Record esp-based slot for receiver on stack for non-static methods
2230   int receiver_offset = -1;
2231 
2232   // This is a trick. We double the stack slots so we can claim
2233   // the oops in the caller's frame. Since we are sure to have
2234   // more args than the caller, doubling is enough to make
2235   // sure we can capture all the incoming oop args from the
2236   // caller.
2237   //
2238   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2239 
2240   // Mark location of rbp (someday)
2241   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2242 
2243   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2244   // All inbound args are referenced based on rbp and all outbound args via rsp.
2245 
2246 
2247 #ifdef ASSERT
2248   bool reg_destroyed[Register::number_of_registers];
2249   bool freg_destroyed[XMMRegister::number_of_registers];
2250   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2251     reg_destroyed[r] = false;
2252   }
2253   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2254     freg_destroyed[f] = false;
2255   }
2256 
2257 #endif /* ASSERT */
2258 
2259   // For JNI natives the incoming and outgoing registers are offset upwards.
2260   GrowableArray<int> arg_order(2 * total_in_args);
2261 
2262   VMRegPair tmp_vmreg;
2263   tmp_vmreg.set2(rbx->as_VMReg());
2264 
2265   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2266     arg_order.push(i);
2267     arg_order.push(c_arg);
2268   }
2269 
2270   int temploc = -1;
2271   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2272     int i = arg_order.at(ai);
2273     int c_arg = arg_order.at(ai + 1);
2274     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2275 #ifdef ASSERT
2276     if (in_regs[i].first()->is_Register()) {
2277       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2278     } else if (in_regs[i].first()->is_XMMRegister()) {
2279       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2280     }
2281     if (out_regs[c_arg].first()->is_Register()) {
2282       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2283     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2284       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2285     }
2286 #endif /* ASSERT */
2287     switch (in_sig_bt[i]) {
2288       case T_ARRAY:
2289       case T_OBJECT:
2290         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2291                     ((i == 0) && (!is_static)),
2292                     &receiver_offset);
2293         break;
2294       case T_VOID:
2295         break;
2296 
2297       case T_FLOAT:
2298         __ float_move(in_regs[i], out_regs[c_arg]);
2299           break;
2300 
2301       case T_DOUBLE:
2302         assert( i + 1 < total_in_args &&
2303                 in_sig_bt[i + 1] == T_VOID &&
2304                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2305         __ double_move(in_regs[i], out_regs[c_arg]);
2306         break;
2307 
2308       case T_LONG :
2309         __ long_move(in_regs[i], out_regs[c_arg]);
2310         break;
2311 
2312       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2313 
2314       default:
2315         __ move32_64(in_regs[i], out_regs[c_arg]);
2316     }
2317   }
2318 
2319   int c_arg;
2320 
2321   // Pre-load a static method's oop into r14.  Used both by locking code and
2322   // the normal JNI call code.
2323   // point c_arg at the first arg that is already loaded in case we
2324   // need to spill before we call out
2325   c_arg = total_c_args - total_in_args;
2326 
2327   if (method->is_static()) {
2328 
2329     //  load oop into a register
2330     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2331 
2332     // Now handlize the static class mirror; it's known not-null.
2333     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2334     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2335 
2336     // Now get the handle
2337     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2338     // store the klass handle as second argument
2339     __ movptr(c_rarg1, oop_handle_reg);
2340     // and protect the arg if we must spill
2341     c_arg--;
2342   }
2343 
2344   // Change state to native (we save the return address in the thread, since it might not
2345   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2346   // points into the right code segment. It does not have to be the correct return pc.
2347   // We use the same pc/oopMap repeatedly when we call out
2348 
2349   intptr_t the_pc = (intptr_t) __ pc();
2350   oop_maps->add_gc_map(the_pc - start, map);
2351 
2352   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2353 
2354 
2355   // We have all of the arguments set up at this point. We must not touch any of the
2356   // argument registers from here on (there would be no oop map describing them if we had to save/restore them).
2357 
2358   {
2359     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2360     // protect the args we've loaded
2361     save_args(masm, total_c_args, c_arg, out_regs);
2362     __ mov_metadata(c_rarg1, method());
2363     __ call_VM_leaf(
2364       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2365       r15_thread, c_rarg1);
2366     restore_args(masm, total_c_args, c_arg, out_regs);
2367   }
2368 
2369   // RedefineClasses() tracing support for obsolete method entry
2370   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2371     // protect the args we've loaded
2372     save_args(masm, total_c_args, c_arg, out_regs);
2373     __ mov_metadata(c_rarg1, method());
2374     __ call_VM_leaf(
2375       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2376       r15_thread, c_rarg1);
2377     restore_args(masm, total_c_args, c_arg, out_regs);
2378   }
2379 
2380   // Lock a synchronized method
2381 
2382   // Register definitions used by locking and unlocking
2383 
2384   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2385   const Register obj_reg  = rbx;  // Will contain the oop
2386   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2387   const Register old_hdr  = r13;  // value of old header at unlock time
2388 
2389   Label slow_path_lock;
2390   Label lock_done;
2391 
2392   if (method->is_synchronized()) {
2393     Label count_mon;
2394 
2395     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2396 
2397     // Get the handle (the 2nd argument)
2398     __ mov(oop_handle_reg, c_rarg1);
2399 
2400     // Get address of the box
2401 
2402     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2403 
2404     // Load the oop from the handle
2405     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2406 
2407     if (LockingMode == LM_MONITOR) {
2408       __ jmp(slow_path_lock);
2409     } else if (LockingMode == LM_LEGACY) {
2410       // Load immediate 1 into swap_reg %rax
2411       __ movl(swap_reg, 1);
2412 
2413       // Load (object->mark() | 1) into swap_reg %rax
2414       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2415       if (EnableValhalla) {
2416         // Mask inline_type bit such that we go to the slow path if object is an inline type
2417         __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place));
2418       }
2419 
2420       // Save (object->mark() | 1) into BasicLock's displaced header
2421       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2422 
2423       // src -> dest iff dest == rax else rax <- dest
2424       __ lock();
2425       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2426       __ jcc(Assembler::equal, count_mon);
2427 
2428       // Hmm should this move to the slow path code area???
2429 
2430       // Test if the oopMark is an obvious stack pointer, i.e.,
2431       //  1) (mark & 3) == 0, and
2432       //  2) rsp <= mark < rsp + os::pagesize()
2433       // These 3 tests can be done by evaluating the following
2434       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2435       // assuming both stack pointer and pagesize have their
2436       // least significant 2 bits clear.
2437       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
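      //
      // For illustration, with a 4K page the mask 3 - 4096 is ...fffff003, so the
      // expression is zero exactly when the low two bits of (mark - rsp) are clear
      // and the difference is less than one page, i.e. (mark & 3) == 0 and
      // rsp <= mark < rsp + page_size -- a stack lock owned by this frame.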
2438 
2439       __ subptr(swap_reg, rsp);
2440       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2441 
2442       // Save the test result; for the recursive case, the result is zero
2443       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2444       __ jcc(Assembler::notEqual, slow_path_lock);
2445     } else {
2446       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2447       __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2448     }
2449     __ bind(count_mon);
2450     __ inc_held_monitor_count();
2451 
2452     // Slow path will re-enter here
2453     __ bind(lock_done);
2454   }
2455 
2456   // Finally just about ready to make the JNI call
2457 
2458   // get JNIEnv* which is first argument to native
2459   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2460 
2461   // Now set thread in native
2462   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2463 
2464   __ call(RuntimeAddress(native_func));
2465 
2466   // Verify or restore cpu control state after JNI call
2467   __ restore_cpu_control_state_after_jni(rscratch1);
2468 
2469   // Unpack native results.
2470   switch (ret_type) {
2471   case T_BOOLEAN: __ c2bool(rax);            break;
2472   case T_CHAR   : __ movzwl(rax, rax);      break;
2473   case T_BYTE   : __ sign_extend_byte (rax); break;
2474   case T_SHORT  : __ sign_extend_short(rax); break;
2475   case T_INT    : /* nothing to do */        break;
2476   case T_DOUBLE :
2477   case T_FLOAT  :
2478     // Result is in xmm0 we'll save as needed
2479     break;
2480   case T_ARRAY:                 // Really a handle
2481   case T_OBJECT:                // Really a handle
2482       break; // can't de-handlize until after safepoint check
2483   case T_VOID: break;
2484   case T_LONG: break;
2485   default       : ShouldNotReachHere();
2486   }
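
  // A note on the T_BOOLEAN case above: a JNI function may return any non-zero
  // byte for "true" (e.g. 0x20), so c2bool() canonicalizes the value in rax to
  // 0 or 1 here, since Java code assumes booleans are normalized.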
2487 
2488   Label after_transition;
2489 
2490   // Switch thread to "native transition" state before reading the synchronization state.
2491   // This additional state is necessary because reading and testing the synchronization
2492   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2493   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2494   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2495   //     Thread A is resumed to finish this native method, but doesn't block here since it
2496   //     didn't see any synchronization in progress, and escapes.
2497   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2498 
2499   // Force this write out before the read below
2500   if (!UseSystemMemoryBarrier) {
2501     __ membar(Assembler::Membar_mask_bits(
2502               Assembler::LoadLoad | Assembler::LoadStore |
2503               Assembler::StoreLoad | Assembler::StoreStore));
2504   }
2505 
2506   // check for safepoint operation in progress and/or pending suspend requests
2507   {
2508     Label Continue;
2509     Label slow_path;
2510 
2511     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2512 
2513     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2514     __ jcc(Assembler::equal, Continue);
2515     __ bind(slow_path);
2516 
2517     // Don't use call_VM as it will see a possible pending exception and forward it
2518     // and never return here, preventing us from clearing _last_native_pc down below.
2519     // We also can't use call_VM_leaf, as it will check to see whether rsi & rdi are
2520     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2521     // by hand.
2522     //
2523     __ vzeroupper();
2524     save_native_result(masm, ret_type, stack_slots);
2525     __ mov(c_rarg0, r15_thread);
2526     __ mov(r12, rsp); // remember sp
2527     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2528     __ andptr(rsp, -16); // align stack as required by ABI
2529     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2530     __ mov(rsp, r12); // restore sp
2531     __ reinit_heapbase();
2532     // Restore any method result value
2533     restore_native_result(masm, ret_type, stack_slots);
2534     __ bind(Continue);
2535   }
2536 
2537   // change thread state
2538   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2539   __ bind(after_transition);
2540 
2541   Label reguard;
2542   Label reguard_done;
2543   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2544   __ jcc(Assembler::equal, reguard);
2545   __ bind(reguard_done);
2546 
2547   // The native result, if any, is live at this point
2548 
2549   // Unlock
2550   Label slow_path_unlock;
2551   Label unlock_done;
2552   if (method->is_synchronized()) {
2553 
2554     Label fast_done;
2555 
2556     // Get locked oop from the handle we passed to jni
2557     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2558 
2559     if (LockingMode == LM_LEGACY) {
2560       Label not_recur;
2561       // Simple recursive lock?
2562       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2563       __ jcc(Assembler::notEqual, not_recur);
2564       __ dec_held_monitor_count();
2565       __ jmpb(fast_done);
2566       __ bind(not_recur);
2567     }
2568 
2569     // Must save rax if it is live now because cmpxchg must use it
2570     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2571       save_native_result(masm, ret_type, stack_slots);
2572     }
2573 
2574     if (LockingMode == LM_MONITOR) {
2575       __ jmp(slow_path_unlock);
2576     } else if (LockingMode == LM_LEGACY) {
2577       // get address of the stack lock
2578       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2579       //  get old displaced header
2580       __ movptr(old_hdr, Address(rax, 0));
2581 
2582       // Atomic swap old header if oop still contains the stack lock
2583       __ lock();
2584       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2585       __ jcc(Assembler::notEqual, slow_path_unlock);
2586       __ dec_held_monitor_count();
2587     } else {
2588       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2589       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2590       __ dec_held_monitor_count();
2591     }
2592 
2593     // slow path re-enters here
2594     __ bind(unlock_done);
2595     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2596       restore_native_result(masm, ret_type, stack_slots);
2597     }
2598 
2599     __ bind(fast_done);
2600   }
2601   {
2602     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2603     save_native_result(masm, ret_type, stack_slots);
2604     __ mov_metadata(c_rarg1, method());
2605     __ call_VM_leaf(
2606          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2607          r15_thread, c_rarg1);
2608     restore_native_result(masm, ret_type, stack_slots);
2609   }
2610 
2611   __ reset_last_Java_frame(false);
2612 
2613   // Unbox oop result, e.g. JNIHandles::resolve value.
2614   if (is_reference_type(ret_type)) {
2615     __ resolve_jobject(rax /* value */,
2616                        r15_thread /* thread */,
2617                        rcx /* tmp */);
2618   }
2619 
2620   if (CheckJNICalls) {
2621     // clear_pending_jni_exception_check
2622     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2623   }
2624 
2625   // reset handle block
2626   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2627   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2628 
2629   // pop our frame
2630 
2631   __ leave();
2632 
2633   // Any exception pending?
2634   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2635   __ jcc(Assembler::notEqual, exception_pending);
2636 
2637   // Return
2638 
2639   __ ret(0);
2640 
2641   // Unexpected paths are out of line and go here
2642 
2643   // forward the exception
2644   __ bind(exception_pending);
2645 
2646   // and forward the exception
2647   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2648 
2649   // Slow path locking & unlocking
2650   if (method->is_synchronized()) {
2651 
2652     // BEGIN Slow path lock
2653     __ bind(slow_path_lock);
2654 
2655     // last_Java_frame is already set up. No exceptions can occur, so do a vanilla call, not call_VM
2656     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2657 
2658     // protect the args we've loaded
2659     save_args(masm, total_c_args, c_arg, out_regs);
2660 
2661     __ mov(c_rarg0, obj_reg);
2662     __ mov(c_rarg1, lock_reg);
2663     __ mov(c_rarg2, r15_thread);
2664 
2665     // Not a leaf but we have last_Java_frame setup as we want
2666     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2667     restore_args(masm, total_c_args, c_arg, out_regs);
2668 
2669 #ifdef ASSERT
2670     { Label L;
2671     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2672     __ jcc(Assembler::equal, L);
2673     __ stop("no pending exception allowed on exit from monitorenter");
2674     __ bind(L);
2675     }
2676 #endif
2677     __ jmp(lock_done);
2678 
2679     // END Slow path lock
2680 
2681     // BEGIN Slow path unlock
2682     __ bind(slow_path_unlock);
2683 
2684     // If we haven't already saved the native result we must save it now as xmm registers
2685     // are still exposed.
2686     __ vzeroupper();
2687     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2688       save_native_result(masm, ret_type, stack_slots);
2689     }
2690 
2691     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2692 
2693     __ mov(c_rarg0, obj_reg);
2694     __ mov(c_rarg2, r15_thread);
2695     __ mov(r12, rsp); // remember sp
2696     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2697     __ andptr(rsp, -16); // align stack as required by ABI
2698 
2699     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2700     // NOTE that obj_reg == rbx currently
2701     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2702     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2703 
2704     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2705     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2706     __ mov(rsp, r12); // restore sp
2707     __ reinit_heapbase();
2708 #ifdef ASSERT
2709     {
2710       Label L;
2711       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2712       __ jcc(Assembler::equal, L);
2713       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2714       __ bind(L);
2715     }
2716 #endif /* ASSERT */
2717 
2718     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2719 
2720     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2721       restore_native_result(masm, ret_type, stack_slots);
2722     }
2723     __ jmp(unlock_done);
2724 
2725     // END Slow path unlock
2726 
2727   } // synchronized
2728 
2729   // SLOW PATH Reguard the stack if needed
2730 
2731   __ bind(reguard);
2732   __ vzeroupper();
2733   save_native_result(masm, ret_type, stack_slots);
2734   __ mov(r12, rsp); // remember sp
2735   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2736   __ andptr(rsp, -16); // align stack as required by ABI
2737   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2738   __ mov(rsp, r12); // restore sp
2739   __ reinit_heapbase();
2740   restore_native_result(masm, ret_type, stack_slots);
2741   // and continue
2742   __ jmp(reguard_done);
2743 
2744 
2745 
2746   __ flush();
2747 
2748   nmethod *nm = nmethod::new_native_nmethod(method,
2749                                             compile_id,
2750                                             masm->code(),
2751                                             vep_offset,
2752                                             frame_complete,
2753                                             stack_slots / VMRegImpl::slots_per_word,
2754                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2755                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2756                                             oop_maps);
2757 
2758   return nm;
2759 }
2760 
2761 // This function returns the adjustment (in number of words) to a c2i adapter
2762 // activation for use during deoptimization.
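// For illustration only (a hedged example, not part of the runtime): with,
// say, 2 incoming parameters and 5 interpreter locals, the caller frame has
// to grow by (5 - 2) * Interpreter::stackElementWords words to make room for
// the extra locals of the re-created interpreter frame:
//
//   int adjust = Deoptimization::last_frame_adjust(2 /*parameters*/, 5 /*locals*/);
//   // with stackElementWords == 1 on amd64 this yields adjust == 3 words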
2763 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2764   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2765 }
2766 
2767 
2768 uint SharedRuntime::out_preserve_stack_slots() {
2769   return 0;
2770 }
2771 
2772 
2773 // Number of stack slots between incoming argument block and the start of
2774 // a new frame.  The PROLOG must add this many slots to the stack.  The
2775 // EPILOG must remove this many slots.  amd64 needs two slots for
2776 // return address.
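// A quick check of the arithmetic (a hedged note, not authoritative):
// compiler stack slots are jint-sized (VMRegImpl::stack_slot_size == 4), so
// the 4 slots returned below cover 16 bytes -- two 8-byte words, presumably
// the return address plus the saved rbp. -XX:+VerifyStackAtCalls reserves
// two more slots (one extra word) for the stack-verification canary:
//
//   4 slots * 4 bytes/slot       == 16 bytes == 2 words
//   (4 + 2) slots * 4 bytes/slot == 24 bytes == 3 words   (VerifyStackAtCalls)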
2777 uint SharedRuntime::in_preserve_stack_slots() {
2778   return 4 + 2 * VerifyStackAtCalls;
2779 }
2780 
2781 //------------------------------generate_deopt_blob----------------------------
2782 void SharedRuntime::generate_deopt_blob() {
2783   // Allocate space for the code
2784   ResourceMark rm;
2785   // Setup code generation tools
2786   int pad = 0;
2787   if (UseAVX > 2) {
2788     pad += 1024;
2789   }
2790 #if INCLUDE_JVMCI
2791   if (EnableJVMCI) {
2792     pad += 512; // Increase the buffer size when compiling for JVMCI
2793   }
2794 #endif
2795   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2796   MacroAssembler* masm = new MacroAssembler(&buffer);
2797   int frame_size_in_words;
2798   OopMap* map = nullptr;
2799   OopMapSet *oop_maps = new OopMapSet();
2800 
2801   // -------------
2802   // This code enters when returning to a de-optimized nmethod.  A return
2803   // address has been pushed on the stack, and return values are in
2804   // registers.
2805   // If we are doing a normal deopt then we were called from the patched
2806   // nmethod from the point we returned to the nmethod. So the return
2807   // address on the stack is wrong by NativeCall::instruction_size
2808   // We will adjust the value so it looks like we have the original return
2809   // address on the stack (like when we eagerly deoptimized).
2810   // In the case of an exception pending when deoptimizing, we enter
2811   // with a return address on the stack that points after the call we patched
2812   // into the exception handler. We have the following register state from,
2813   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2814   //    rax: exception oop
2815   //    rbx: exception handler
2816   //    rdx: throwing pc
2817   // So in this case we simply jam rdx into the useless return address and
2818   // the stack looks just like we want.
2819   //
2820   // At this point we need to de-opt.  We save the argument return
2821   // registers.  We call the first C routine, fetch_unroll_info().  This
2822   // routine captures the return values and returns a structure which
2823   // describes the current frame size and the sizes of all replacement frames.
2824   // The current frame is compiled code and may contain many inlined
2825   // functions, each with their own JVM state.  We pop the current frame, then
2826   // push all the new frames.  Then we call the C routine unpack_frames() to
2827   // populate these frames.  Finally unpack_frames() returns us the new target
2828   // address.  Notice that callee-save registers are BLOWN here; they have
2829   // already been captured in the vframeArray at the time the return PC was
2830   // patched.
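  //
  // A rough sketch of that sequence in C-like pseudocode (illustrative only;
  // the UnrollBlock accessor names are indicative and
  // push_skeletal_interpreter_frame() is a made-up placeholder for the
  // frame-pushing loop emitted further down):
  //
  //   save_all_registers();                                  // recorded in the oop map
  //   UnrollBlock* info = fetch_unroll_info(thread, exec_mode);
  //   restore_result_registers();                            // rax / xmm0
  //   pop_frame(info->size_of_deoptimized_frame());
  //   for (int k = 0; k < info->number_of_frames(); k++) {
  //     push_skeletal_interpreter_frame(info->frame_sizes()[k],
  //                                     info->frame_pcs()[k]);
  //   }
  //   unpack_frames(thread, exec_mode);                      // fill in the skeletal frames
  //   // return into the interpreter at the target bci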
2831   address start = __ pc();
2832   Label cont;
2833 
2834   // Prolog for non exception case!
2835 
2836   // Save everything in sight.
2837   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2838 
2839   // Normal deoptimization.  Save exec mode for unpack_frames.
2840   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2841   __ jmp(cont);
2842 
2843   int reexecute_offset = __ pc() - start;
2844 #if INCLUDE_JVMCI && !defined(COMPILER1)
2845   if (EnableJVMCI && UseJVMCICompiler) {
2846     // JVMCI does not use this kind of deoptimization
2847     __ should_not_reach_here();
2848   }
2849 #endif
2850 
2851   // Reexecute case
2852   // The return address is the pc that describes which bci to re-execute at.
2853 
2854   // No need to update map as each call to save_live_registers will produce identical oopmap
2855   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2856 
2857   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2858   __ jmp(cont);
2859 
2860 #if INCLUDE_JVMCI
2861   Label after_fetch_unroll_info_call;
2862   int implicit_exception_uncommon_trap_offset = 0;
2863   int uncommon_trap_offset = 0;
2864 
2865   if (EnableJVMCI) {
2866     implicit_exception_uncommon_trap_offset = __ pc() - start;
2867 
2868     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2869     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2870 
2871     uncommon_trap_offset = __ pc() - start;
2872 
2873     // Save everything in sight.
2874     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2875     // fetch_unroll_info needs to call last_java_frame()
2876     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2877 
2878     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2879     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2880 
2881     __ movl(r14, Deoptimization::Unpack_reexecute);
2882     __ mov(c_rarg0, r15_thread);
2883     __ movl(c_rarg2, r14); // exec mode
2884     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2885     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2886 
2887     __ reset_last_Java_frame(false);
2888 
2889     __ jmp(after_fetch_unroll_info_call);
2890   } // EnableJVMCI
2891 #endif // INCLUDE_JVMCI
2892 
2893   int exception_offset = __ pc() - start;
2894 
2895   // Prolog for exception case
2896 
2897   // all registers are dead at this entry point, except for rax and
2898   // rdx, which contain the exception oop and exception pc
2899   // respectively.  Set them in TLS and fall thru to the
2900   // unpack_with_exception_in_tls entry point.
2901 
2902   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2903   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2904 
2905   int exception_in_tls_offset = __ pc() - start;
2906 
2907   // new implementation because exception oop is now passed in JavaThread
2908 
2909   // Prolog for exception case
2910   // All registers must be preserved because they might be used by LinearScan
2911   // Exception oop and throwing PC are passed in JavaThread
2912   // tos: stack at point of call to method that threw the exception (i.e. only
2913   // args are on the stack, no return address)
2914 
2915   // make room on stack for the return address
2916   // It will be patched later with the throwing pc. The correct value is not
2917   // available now because loading it from memory would destroy registers.
2918   __ push(0);
2919 
2920   // Save everything in sight.
2921   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2922 
2923   // Now it is safe to overwrite any register
2924 
2925   // Deopt during an exception.  Save exec mode for unpack_frames.
2926   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2927 
2928   // load throwing pc from JavaThread and patch it as the return address
2929   // of the current frame. Then clear the field in JavaThread
2930 
2931   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2932   __ movptr(Address(rbp, wordSize), rdx);
2933   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2934 
2935 #ifdef ASSERT
2936   // verify that there is really an exception oop in JavaThread
2937   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2938   __ verify_oop(rax);
2939 
2940   // verify that there is no pending exception
2941   Label no_pending_exception;
2942   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2943   __ testptr(rax, rax);
2944   __ jcc(Assembler::zero, no_pending_exception);
2945   __ stop("must not have pending exception here");
2946   __ bind(no_pending_exception);
2947 #endif
2948 
2949   __ bind(cont);
2950 
2951   // Call C code.  Need thread and this frame, but NOT official VM entry
2952   // crud.  We cannot block on this call, no GC can happen.
2953   //
2954   // UnrollBlock* fetch_unroll_info(JavaThread* thread, int exec_mode)
2955 
2956   // fetch_unroll_info needs to call last_java_frame().
2957 
2958   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2959 #ifdef ASSERT
2960   { Label L;
2961     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2962     __ jcc(Assembler::equal, L);
2963     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2964     __ bind(L);
2965   }
2966 #endif // ASSERT
2967   __ mov(c_rarg0, r15_thread);
2968   __ movl(c_rarg1, r14); // exec_mode
2969   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2970 
2971   // Need to have an oopmap that tells fetch_unroll_info where to
2972   // find any register it might need.
2973   oop_maps->add_gc_map(__ pc() - start, map);
2974 
2975   __ reset_last_Java_frame(false);
2976 
2977 #if INCLUDE_JVMCI
2978   if (EnableJVMCI) {
2979     __ bind(after_fetch_unroll_info_call);
2980   }
2981 #endif
2982 
2983   // Load UnrollBlock* into rdi
2984   __ mov(rdi, rax);
2985 
2986   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2987   Label noException;
2988   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2989   __ jcc(Assembler::notEqual, noException);
2990   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2991   // QQQ this is useless it was null above
2992   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2993   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2994   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2995 
2996   __ verify_oop(rax);
2997 
2998   // Overwrite the result registers with the exception results.
2999   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3000   // I think this is useless
3001   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
3002 
3003   __ bind(noException);
3004 
3005   // Only register save data is on the stack.
3006   // Now restore the result registers.  Everything else is either dead
3007   // or captured in the vframeArray.
3008   RegisterSaver::restore_result_registers(masm);
3009 
3010   // All of the register save area has been popped off the stack. Only the
3011   // return address remains.
3012 
3013   // Pop all the frames we must move/replace.
3014   //
3015   // Frame picture (youngest to oldest)
3016   // 1: self-frame (no frame link)
3017   // 2: deopting frame  (no frame link)
3018   // 3: caller of deopting frame (could be compiled/interpreted).
3019   //
3020   // Note: by leaving the return address of self-frame on the stack
3021   // and using the size of frame 2 to adjust the stack
3022   // when we are done the return to frame 3 will still be on the stack.
3023 
3024   // Pop deoptimized frame
3025   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
3026   __ addptr(rsp, rcx);
3027 
3028   // rsp should be pointing at the return address to the caller (3)
3029 
3030   // Pick up the initial fp we should save
3031   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3032   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3033 
3034 #ifdef ASSERT
3035   // Compilers generate code that bang the stack by as much as the
3036   // interpreter would need. So this stack banging should never
3037   // trigger a fault. Verify that it does not on non product builds.
3038   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3039   __ bang_stack_size(rbx, rcx);
3040 #endif
3041 
3042   // Load address of array of frame pcs into rcx
3043   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3044 
3045   // Trash the old pc
3046   __ addptr(rsp, wordSize);
3047 
3048   // Load address of array of frame sizes into rsi
3049   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3050 
3051   // Load counter into rdx
3052   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
3053 
3054   // Now adjust the caller's stack to make up for the extra locals,
3055   // but record the original sp first so that we can save it in the skeletal interpreter
3056   // frame; the stack walking of interpreter_sender will then get the unextended sp
3057   // value and not the "real" sp value.
3058 
3059   const Register sender_sp = r8;
3060 
3061   __ mov(sender_sp, rsp);
3062   __ movl(rbx, Address(rdi,
3063                        Deoptimization::UnrollBlock::
3064                        caller_adjustment_offset()));
3065   __ subptr(rsp, rbx);
3066 
3067   // Push interpreter frames in a loop
3068   Label loop;
3069   __ bind(loop);
3070   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3071   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3072   __ pushptr(Address(rcx, 0));          // Save return address
3073   __ enter();                           // Save old & set new ebp
3074   __ subptr(rsp, rbx);                  // Prolog
3075   // This value is corrected by layout_activation_impl
3076   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3077   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3078   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3079   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3080   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3081   __ decrementl(rdx);                   // Decrement counter
3082   __ jcc(Assembler::notZero, loop);
3083   __ pushptr(Address(rcx, 0));          // Save final return address
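
  // The loop above is equivalent to the following C-like sketch (for
  // illustration only; sizes[] and pcs[] stand for the UnrollBlock arrays
  // loaded into rsi/rcx, and push()/alloc() stand for the emitted stack ops):
  //
  //   for (int k = 0; k < number_of_frames; k++) {
  //     push(pcs[k]);                    // return address for frame k
  //     push(rbp); rbp = rsp;            // enter(): link the new frame
  //     alloc(sizes[k] - 2 * wordSize);  // rest of the skeletal frame
  //     interpreter_frame_last_sp   = nullptr;    // fixed up by layout_activation_impl
  //     interpreter_frame_sender_sp = sender_sp;  // keep the frame walkable
  //     sender_sp = rsp;                 // becomes the next frame's sender sp
  //   }
  //   push(pcs[number_of_frames]);       // final return address (caller of the deoptee)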
3084 
3085   // Re-push self-frame
3086   __ enter();                           // Save old & set new ebp
3087 
3088   // Allocate a full sized register save area.
3089   // Return address and rbp are in place, so we allocate two less words.
3090   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3091 
3092   // Restore frame locals after moving the frame
3093   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3094   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3095 
3096   // Call C code.  Need thread but NOT official VM entry
3097   // crud.  We cannot block on this call, no GC can happen.  Call should
3098   // restore return values to their stack-slots with the new SP.
3099   //
3100   // BasicType Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3101 
3102   // Use rbp because the frames look interpreted now
3103   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3104   // Don't need the precise return PC here, just precise enough to point into this code blob.
3105   address the_pc = __ pc();
3106   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3107 
3108   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3109   __ mov(c_rarg0, r15_thread);
3110   __ movl(c_rarg1, r14); // second arg: exec_mode
3111   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3112   // Revert SP alignment after call since we're going to do some SP relative addressing below
3113   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3114 
3115   // Set an oopmap for the call site
3116   // Use the same PC we used for the last java frame
3117   oop_maps->add_gc_map(the_pc - start,
3118                        new OopMap( frame_size_in_words, 0 ));
3119 
3120   // Clear fp AND pc
3121   __ reset_last_Java_frame(true);
3122 
3123   // Collect return values
3124   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3125   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3126   // I think this is useless (throwing pc?)
3127   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3128 
3129   // Pop self-frame.
3130   __ leave();                           // Epilog
3131 
3132   // Jump to interpreter
3133   __ ret(0);
3134 
3135   // Make sure all code is generated
3136   masm->flush();
3137 
3138   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3139   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3140 #if INCLUDE_JVMCI
3141   if (EnableJVMCI) {
3142     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3143     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3144   }
3145 #endif
3146 }
3147 
3148 #ifdef COMPILER2
3149 //------------------------------generate_uncommon_trap_blob--------------------
3150 void SharedRuntime::generate_uncommon_trap_blob() {
3151   // Allocate space for the code
3152   ResourceMark rm;
3153   // Setup code generation tools
3154   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
3155   MacroAssembler* masm = new MacroAssembler(&buffer);
3156 
3157   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3158 
3159   address start = __ pc();
3160 
3161   if (UseRTMLocking) {
3162     // Abort RTM transaction before possible nmethod deoptimization.
3163     __ xabort(0);
3164   }
3165 
3166   // Push self-frame.  We get here with a return address on the
3167   // stack, so rsp is 8-byte aligned until we allocate our frame.
3168   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3169 
3170   // No callee saved registers. rbp is assumed implicitly saved
3171   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3172 
3173   // The compiler left unloaded_class_index in j_rarg0; move it to where the
3174   // runtime expects it.
3175   __ movl(c_rarg1, j_rarg0);
3176 
3177   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3178 
3179   // Call C code.  Need thread but NOT official VM entry
3180   // crud.  We cannot block on this call, no GC can happen.  Call should
3181   // capture callee-saved registers as well as return values.
3182   // Thread is in rdi already.
3183   //
3184   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
3185 
3186   __ mov(c_rarg0, r15_thread);
3187   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
3188   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3189 
3190   // Set an oopmap for the call site
3191   OopMapSet* oop_maps = new OopMapSet();
3192   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
3193 
3194   // location of rbp is known implicitly by the frame sender code
3195 
3196   oop_maps->add_gc_map(__ pc() - start, map);
3197 
3198   __ reset_last_Java_frame(false);
3199 
3200   // Load UnrollBlock* into rdi
3201   __ mov(rdi, rax);
3202 
3203 #ifdef ASSERT
3204   { Label L;
3205     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
3206               Deoptimization::Unpack_uncommon_trap);
3207     __ jcc(Assembler::equal, L);
3208     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
3209     __ bind(L);
3210   }
3211 #endif
3212 
3213   // Pop all the frames we must move/replace.
3214   //
3215   // Frame picture (youngest to oldest)
3216   // 1: self-frame (no frame link)
3217   // 2: deopting frame  (no frame link)
3218   // 3: caller of deopting frame (could be compiled/interpreted).
3219 
3220   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3221   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3222 
3223   // Pop deoptimized frame (int)
3224   __ movl(rcx, Address(rdi,
3225                        Deoptimization::UnrollBlock::
3226                        size_of_deoptimized_frame_offset()));
3227   __ addptr(rsp, rcx);
3228 
3229   // rsp should be pointing at the return address to the caller (3)
3230 
3231   // Pick up the initial fp we should save
3232   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3233   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3234 
3235 #ifdef ASSERT
3236   // Compilers generate code that bang the stack by as much as the
3237   // interpreter would need. So this stack banging should never
3238   // trigger a fault. Verify that it does not on non product builds.
3239   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3240   __ bang_stack_size(rbx, rcx);
3241 #endif
3242 
3243   // Load address of array of frame pcs into rcx (address*)
3244   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3245 
3246   // Trash the return pc
3247   __ addptr(rsp, wordSize);
3248 
3249   // Load address of array of frame sizes into rsi (intptr_t*)
3250   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3251 
3252   // Counter
3253   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
3254 
3255   // Now adjust the caller's stack to make up for the extra locals, but
3256   // record the original sp first so that we can save it in the skeletal
3257   // interpreter frame; the stack walking of interpreter_sender
3258   // will then get the unextended sp value and not the "real" sp value.
3259 
3260   const Register sender_sp = r8;
3261 
3262   __ mov(sender_sp, rsp);
3263   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
3264   __ subptr(rsp, rbx);
3265 
3266   // Push interpreter frames in a loop
3267   Label loop;
3268   __ bind(loop);
3269   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3270   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3271   __ pushptr(Address(rcx, 0));     // Save return address
3272   __ enter();                      // Save old & set new rbp
3273   __ subptr(rsp, rbx);             // Prolog
3274   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3275             sender_sp);            // Make it walkable
3276   // This value is corrected by layout_activation_impl
3277   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3278   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3279   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3280   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3281   __ decrementl(rdx);              // Decrement counter
3282   __ jcc(Assembler::notZero, loop);
3283   __ pushptr(Address(rcx, 0));     // Save final return address
3284 
3285   // Re-push self-frame
3286   __ enter();                 // Save old & set new rbp
3287   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3288                               // Prolog
3289 
3290   // Use rbp because the frames look interpreted now
3291   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3292   // Don't need the precise return PC here, just precise enough to point into this code blob.
3293   address the_pc = __ pc();
3294   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3295 
3296   // Call C code.  Need thread but NOT official VM entry
3297   // crud.  We cannot block on this call, no GC can happen.  Call should
3298   // restore return values to their stack-slots with the new SP.
3299   // Thread is in rdi already.
3300   //
3301   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3302 
3303   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3304   __ mov(c_rarg0, r15_thread);
3305   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3306   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3307 
3308   // Set an oopmap for the call site
3309   // Use the same PC we used for the last java frame
3310   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3311 
3312   // Clear fp AND pc
3313   __ reset_last_Java_frame(true);
3314 
3315   // Pop self-frame.
3316   __ leave();                 // Epilog
3317 
3318   // Jump to interpreter
3319   __ ret(0);
3320 
3321   // Make sure all code is generated
3322   masm->flush();
3323 
3324   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3325                                                  SimpleRuntimeFrame::framesize >> 1);
3326 }
3327 #endif // COMPILER2
3328 
3329 //------------------------------generate_handler_blob------
3330 //
3331 // Generate a special Compile2Runtime blob that saves all registers,
3332 // and sets up an oopmap.
3333 //
3334 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3335   assert(StubRoutines::forward_exception_entry() != nullptr,
3336          "must be generated before");
3337 
3338   ResourceMark rm;
3339   OopMapSet *oop_maps = new OopMapSet();
3340   OopMap* map;
3341 
3342   // Allocate space for the code.  Setup code generation tools.
3343   CodeBuffer buffer("handler_blob", 2048, 1024);
3344   MacroAssembler* masm = new MacroAssembler(&buffer);
3345 
3346   address start   = __ pc();
3347   address call_pc = nullptr;
3348   int frame_size_in_words;
3349   bool cause_return = (poll_type == POLL_AT_RETURN);
3350   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3351 
3352   if (UseRTMLocking) {
3353     // Abort RTM transaction before calling runtime
3354     // because critical section will be large and will be
3355     // aborted anyway. Also nmethod could be deoptimized.
3356     __ xabort(0);
3357   }
3358 
3359   // Make room for return address (or push it again)
3360   if (!cause_return) {
3361     __ push(rbx);
3362   }
3363 
3364   // Save registers, fpu state, and flags
3365   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3366 
3367   // The following is basically a call_VM.  However, we need the precise
3368   // address of the call in order to generate an oopmap. Hence, we do all the
3369   // work ourselves.
3370 
3371   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3372 
3373   // The return address must always be correct so that frame constructor never
3374   // sees an invalid pc.
3375 
3376   if (!cause_return) {
3377     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3378     // Additionally, rbx is a callee saved register and we can look at it later to determine
3379     // if someone changed the return address for us!
3380     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3381     __ movptr(Address(rbp, wordSize), rbx);
3382   }
3383 
3384   // Do the call
3385   __ mov(c_rarg0, r15_thread);
3386   __ call(RuntimeAddress(call_ptr));
3387 
3388   // Set an oopmap for the call site.  This oopmap will map all
3389   // oop-registers and debug-info registers as callee-saved.  This
3390   // will allow deoptimization at this safepoint to find all possible
3391   // debug-info recordings, as well as let GC find all oops.
3392 
3393   oop_maps->add_gc_map( __ pc() - start, map);
3394 
3395   Label noException;
3396 
3397   __ reset_last_Java_frame(false);
3398 
3399   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3400   __ jcc(Assembler::equal, noException);
3401 
3402   // Exception pending
3403 
3404   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3405 
3406   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3407 
3408   // No exception case
3409   __ bind(noException);
3410 
3411   Label no_adjust;
3412 #ifdef ASSERT
3413   Label bail;
3414 #endif
3415   if (!cause_return) {
3416     Label no_prefix, not_special;
3417 
3418     // If our stashed return pc was modified by the runtime we avoid touching it
3419     __ cmpptr(rbx, Address(rbp, wordSize));
3420     __ jccb(Assembler::notEqual, no_adjust);
3421 
3422     // Skip over the poll instruction.
3423     // See NativeInstruction::is_safepoint_poll()
3424     // Possible encodings:
3425     //      85 00       test   %eax,(%rax)
3426     //      85 01       test   %eax,(%rcx)
3427     //      85 02       test   %eax,(%rdx)
3428     //      85 03       test   %eax,(%rbx)
3429     //      85 06       test   %eax,(%rsi)
3430     //      85 07       test   %eax,(%rdi)
3431     //
3432     //   41 85 00       test   %eax,(%r8)
3433     //   41 85 01       test   %eax,(%r9)
3434     //   41 85 02       test   %eax,(%r10)
3435     //   41 85 03       test   %eax,(%r11)
3436     //   41 85 06       test   %eax,(%r14)
3437     //   41 85 07       test   %eax,(%r15)
3438     //
3439     //      85 04 24    test   %eax,(%rsp)
3440     //   41 85 04 24    test   %eax,(%r12)
3441     //      85 45 00    test   %eax,0x0(%rbp)
3442     //   41 85 45 00    test   %eax,0x0(%r13)
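    //
    // The adjustment below recovers the poll length from those encodings.
    // A C sketch of the same logic (illustrative only; rbx holds the stashed
    // return pc, which points at the poll instruction):
    //
    //   int len = 2;                             // 0x85 opcode + modrm byte
    //   if (pc[0] == 0x41 /* REX.B */) len += 1; // r8..r15 based polls
    //   int base = modrm & 0x07;                 // low 3 bits of the modrm byte
    //   if (base == 4 || base == 5)    len += 1; // rsp/r12 need a SIB byte,
    //                                            // rbp/r13 need a disp8 byte
    //   return_pc += len;                        // step over the poll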
3443 
3444     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3445     __ jcc(Assembler::notEqual, no_prefix);
3446     __ addptr(rbx, 1);
3447     __ bind(no_prefix);
3448 #ifdef ASSERT
3449     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3450 #endif
3451     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3452     // r12/rsp 0x04
3453     // r13/rbp 0x05
3454     __ movzbq(rcx, Address(rbx, 1));
3455     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3456     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3457     __ cmpptr(rcx, 1);
3458     __ jcc(Assembler::above, not_special);
3459     __ addptr(rbx, 1);
3460     __ bind(not_special);
3461 #ifdef ASSERT
3462     // Verify the correct encoding of the poll we're about to skip.
3463     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3464     __ jcc(Assembler::notEqual, bail);
3465     // Mask out the modrm bits
3466     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3467     // rax encodes to 0, so if the bits are nonzero it's incorrect
3468     __ jcc(Assembler::notZero, bail);
3469 #endif
3470     // Adjust return pc forward to step over the safepoint poll instruction
3471     __ addptr(rbx, 2);
3472     __ movptr(Address(rbp, wordSize), rbx);
3473   }
3474 
3475   __ bind(no_adjust);
3476   // Normal exit, restore registers and exit.
3477   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3478   __ ret(0);
3479 
3480 #ifdef ASSERT
3481   __ bind(bail);
3482   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3483 #endif
3484 
3485   // Make sure all code is generated
3486   masm->flush();
3487 
3488   // Fill-out other meta info
3489   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3490 }
3491 
3492 //
3493 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3494 //
3495 // Generate a stub that calls into vm to find out the proper destination
3496 // of a java call. All the argument registers are live at this point
3497 // but since this is generic code we don't know what they are and the caller
3498 // must do any gc of the args.
3499 //
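// A typical instantiation looks roughly like the following (a hedged sketch;
// the authoritative list of destinations and stub names is in the shared
// SharedRuntime::generate_stubs() code):
//
//   _resolve_static_call_blob =
//     generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_static_call_C),
//                           "resolve_static_call");
//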
3500 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3501   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3502 
3503   // allocate space for the code
3504   ResourceMark rm;
3505 
3506   CodeBuffer buffer(name, 1200, 512);
3507   MacroAssembler* masm = new MacroAssembler(&buffer);
3508 
3509   int frame_size_in_words;
3510 
3511   OopMapSet *oop_maps = new OopMapSet();
3512   OopMap* map = nullptr;
3513 
3514   int start = __ offset();
3515 
3516   // No need to save vector registers since they are caller-saved anyway.
3517   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3518 
3519   int frame_complete = __ offset();
3520 
3521   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3522 
3523   __ mov(c_rarg0, r15_thread);
3524 
3525   __ call(RuntimeAddress(destination));
3526 
3527 
3528   // Set an oopmap for the call site.
3529   // We need this not only for callee-saved registers, but also for volatile
3530   // registers that the compiler might be keeping live across a safepoint.
3531 
3532   oop_maps->add_gc_map( __ offset() - start, map);
3533 
3534   // rax contains the address we are going to jump to, assuming no exception got installed
3535 
3536   // clear last_Java_sp
3537   __ reset_last_Java_frame(false);
3538   // check for pending exceptions
3539   Label pending;
3540   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3541   __ jcc(Assembler::notEqual, pending);
3542 
3543   // get the returned Method*
3544   __ get_vm_result_2(rbx, r15_thread);
3545   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3546 
3547   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3548 
3549   RegisterSaver::restore_live_registers(masm);
3550 
3551   // We are back to the original state on entry and ready to go.
3552 
3553   __ jmp(rax);
3554 
3555   // Pending exception after the safepoint
3556 
3557   __ bind(pending);
3558 
3559   RegisterSaver::restore_live_registers(masm);
3560 
3561   // exception pending => remove activation and forward to exception handler
3562 
3563   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3564 
3565   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3566   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3567 
3568   // -------------
3569   // make sure all code is generated
3570   masm->flush();
3571 
3572   // return the blob
3573   // frame_size_words or bytes??
3574   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3575 }
3576 
3577 //------------------------------Montgomery multiplication------------------------
3578 //
3579 
3580 #ifndef _WINDOWS
3581 
3582 // Subtract 0:b from carry:a.  Return carry.
3583 static julong
3584 sub(julong a[], julong b[], julong carry, long len) {
3585   long long i = 0, cnt = len;
3586   julong tmp;
3587   asm volatile("clc; "
3588                "0: ; "
3589                "mov (%[b], %[i], 8), %[tmp]; "
3590                "sbb %[tmp], (%[a], %[i], 8); "
3591                "inc %[i]; dec %[cnt]; "
3592                "jne 0b; "
3593                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3594                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3595                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3596                : "memory");
3597   return tmp;
3598 }
3599 
3600 // Multiply (unsigned) Long A by Long B, accumulating the double-
3601 // length result into the accumulator formed of T0, T1, and T2.
3602 #define MACC(A, B, T0, T1, T2)                                  \
3603 do {                                                            \
3604   unsigned long hi, lo;                                         \
3605   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3606            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3607            : "r"(A), "a"(B) : "cc");                            \
3608  } while(0)
3609 
3610 // As above, but add twice the double-length result into the
3611 // accumulator.
3612 #define MACC2(A, B, T0, T1, T2)                                 \
3613 do {                                                            \
3614   unsigned long hi, lo;                                         \
3615   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3616            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3617            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3618            : "r"(A), "a"(B) : "cc");                            \
3619  } while(0)
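
// In other words, the accumulator (T2:T1:T0) holds a 192-bit running sum and
// MACC adds the full 128-bit product A*B into it; MACC2 adds the product
// twice, which covers the doubled off-diagonal terms when squaring. An
// equivalent plain-C sketch, for illustration only (__int128 stands in for
// the rdx:rax pair produced by mul):
//
//   unsigned __int128 p = (unsigned __int128)A * B;
//   julong lo = (julong)p, hi = (julong)(p >> 64);
//   t0 += lo;  julong c = (t0 < lo);   // carry out of the low word
//   t1 += c;   c = (t1 < c);
//   t1 += hi;  c += (t1 < hi);
//   t2 += c;                           // t2 absorbs the remaining carry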
3620 
3621 #else //_WINDOWS
3622 
3623 static julong
3624 sub(julong a[], julong b[], julong carry, long len) {
3625   long i;
3626   julong tmp;
3627   unsigned char c = 1;
3628   for (i = 0; i < len; i++) {
3629     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3630     a[i] = tmp;
3631   }
3632   c = _addcarry_u64(c, carry, ~0, &tmp);
3633   return tmp;
3634 }
3635 
3636 // Multiply (unsigned) Long A by Long B, accumulating the double-
3637 // length result into the accumulator formed of T0, T1, and T2.
3638 #define MACC(A, B, T0, T1, T2)                          \
3639 do {                                                    \
3640   julong hi, lo;                            \
3641   lo = _umul128(A, B, &hi);                             \
3642   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3643   c = _addcarry_u64(c, hi, T1, &T1);                    \
3644   _addcarry_u64(c, T2, 0, &T2);                         \
3645  } while(0)
3646 
3647 // As above, but add twice the double-length result into the
3648 // accumulator.
3649 #define MACC2(A, B, T0, T1, T2)                         \
3650 do {                                                    \
3651   julong hi, lo;                            \
3652   lo = _umul128(A, B, &hi);                             \
3653   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3654   c = _addcarry_u64(c, hi, T1, &T1);                    \
3655   _addcarry_u64(c, T2, 0, &T2);                         \
3656   c = _addcarry_u64(0, lo, T0, &T0);                    \
3657   c = _addcarry_u64(c, hi, T1, &T1);                    \
3658   _addcarry_u64(c, T2, 0, &T2);                         \
3659  } while(0)
3660 
3661 #endif //_WINDOWS
3662 
3663 // Fast Montgomery multiplication.  The derivation of the algorithm is
3664 // in  A Cryptographic Library for the Motorola DSP56000,
3665 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
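//
// The key identity (a hedged summary of that derivation): inv is chosen so
// that inv * n[0] == -1 (mod 2^64), which is exactly what the asserts below
// check via ULLONG_MAX. With t0 the low word of the accumulator,
//
//   m[i] = t0 * inv  (mod 2^64)   implies   t0 + m[i] * n[0] == 0  (mod 2^64)
//
// so after the MACC(m[i], n[0], ...) step the low word is zero and the
// accumulator can be shifted down one word (t0 = t1; t1 = t2; t2 = 0), which
// performs the word-by-word Montgomery reduction by R = 2^(64*len).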
3666 
3667 static void NOINLINE
3668 montgomery_multiply(julong a[], julong b[], julong n[],
3669                     julong m[], julong inv, int len) {
3670   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3671   int i;
3672 
3673   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3674 
3675   for (i = 0; i < len; i++) {
3676     int j;
3677     for (j = 0; j < i; j++) {
3678       MACC(a[j], b[i-j], t0, t1, t2);
3679       MACC(m[j], n[i-j], t0, t1, t2);
3680     }
3681     MACC(a[i], b[0], t0, t1, t2);
3682     m[i] = t0 * inv;
3683     MACC(m[i], n[0], t0, t1, t2);
3684 
3685     assert(t0 == 0, "broken Montgomery multiply");
3686 
3687     t0 = t1; t1 = t2; t2 = 0;
3688   }
3689 
3690   for (i = len; i < 2*len; i++) {
3691     int j;
3692     for (j = i-len+1; j < len; j++) {
3693       MACC(a[j], b[i-j], t0, t1, t2);
3694       MACC(m[j], n[i-j], t0, t1, t2);
3695     }
3696     m[i-len] = t0;
3697     t0 = t1; t1 = t2; t2 = 0;
3698   }
3699 
3700   while (t0)
3701     t0 = sub(m, n, t0, len);
3702 }
3703 
3704 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3705 // multiplies so it should be up to 25% faster than Montgomery
3706 // multiplication.  However, its loop control is more complex and it
3707 // may actually run slower on some machines.
3708 
3709 static void NOINLINE
3710 montgomery_square(julong a[], julong n[],
3711                   julong m[], julong inv, int len) {
3712   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3713   int i;
3714 
3715   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3716 
3717   for (i = 0; i < len; i++) {
3718     int j;
3719     int end = (i+1)/2;
3720     for (j = 0; j < end; j++) {
3721       MACC2(a[j], a[i-j], t0, t1, t2);
3722       MACC(m[j], n[i-j], t0, t1, t2);
3723     }
3724     if ((i & 1) == 0) {
3725       MACC(a[j], a[j], t0, t1, t2);
3726     }
3727     for (; j < i; j++) {
3728       MACC(m[j], n[i-j], t0, t1, t2);
3729     }
3730     m[i] = t0 * inv;
3731     MACC(m[i], n[0], t0, t1, t2);
3732 
3733     assert(t0 == 0, "broken Montgomery square");
3734 
3735     t0 = t1; t1 = t2; t2 = 0;
3736   }
3737 
3738   for (i = len; i < 2*len; i++) {
3739     int start = i-len+1;
3740     int end = start + (len - start)/2;
3741     int j;
3742     for (j = start; j < end; j++) {
3743       MACC2(a[j], a[i-j], t0, t1, t2);
3744       MACC(m[j], n[i-j], t0, t1, t2);
3745     }
3746     if ((i & 1) == 0) {
3747       MACC(a[j], a[j], t0, t1, t2);
3748     }
3749     for (; j < len; j++) {
3750       MACC(m[j], n[i-j], t0, t1, t2);
3751     }
3752     m[i-len] = t0;
3753     t0 = t1; t1 = t2; t2 = 0;
3754   }
3755 
3756   while (t0)
3757     t0 = sub(m, n, t0, len);
3758 }
3759 
3760 // Swap words in a longword.
3761 static julong swap(julong x) {
3762   return (x << 32) | (x >> 32);
3763 }
3764 
3765 // Copy len longwords from s to d, word-swapping as we go.  The
3766 // destination array is reversed.
3767 static void reverse_words(julong *s, julong *d, int len) {
3768   d += len;
3769   while(len-- > 0) {
3770     d--;
3771     *d = swap(*s);
3772     s++;
3773   }
3774 }
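
// A worked example (illustration only): word-swapping and reversing a
// two-longword value.
//
//   julong s[2] = { 0x1111111122222222ULL, 0x3333333344444444ULL };
//   julong d[2];
//   reverse_words(s, d, 2);
//   // d[0] == 0x4444444433333333ULL, d[1] == 0x2222222211111111ULL
//
// This is how the jint-based inputs from Java code are converted into the
// julong layout expected by the Montgomery routines above, and back again.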
3775 
3776 // The threshold at which squaring is advantageous was determined
3777 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3778 #define MONTGOMERY_SQUARING_THRESHOLD 64
3779 
3780 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3781                                         jint len, jlong inv,
3782                                         jint *m_ints) {
3783   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3784   int longwords = len/2;
3785 
3786   // Make very sure we don't use so much space that the stack might
3787   // overflow.  512 jints corresponds to a 16384-bit integer and
3788   // will use a total of 8k bytes of stack space here.
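  // Worked out (for illustration): len == 512 jints means longwords == 256,
  // and the four scratch arrays (a, b, n, m) then need
  // 256 * sizeof(julong) * 4 == 256 * 8 * 4 == 8192 bytes, matching the
  // guarantee below.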
3789   int divisor = sizeof(julong) * 4;
3790   guarantee(longwords <= 8192 / divisor, "must be");
3791   int total_allocation = longwords * sizeof (julong) * 4;
3792   julong *scratch = (julong *)alloca(total_allocation);
3793 
3794   // Local scratch arrays
3795   julong
3796     *a = scratch + 0 * longwords,
3797     *b = scratch + 1 * longwords,
3798     *n = scratch + 2 * longwords,
3799     *m = scratch + 3 * longwords;
3800 
3801   reverse_words((julong *)a_ints, a, longwords);
3802   reverse_words((julong *)b_ints, b, longwords);
3803   reverse_words((julong *)n_ints, n, longwords);
3804 
3805   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3806 
3807   reverse_words(m, (julong *)m_ints, longwords);
3808 }
3809 
3810 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3811                                       jint len, jlong inv,
3812                                       jint *m_ints) {
3813   assert(len % 2 == 0, "array length in montgomery_square must be even");
3814   int longwords = len/2;
3815 
3816   // Make very sure we don't use so much space that the stack might
3817   // overflow.  512 jints corresponds to a 16384-bit integer and
3818   // will use a total of 6k bytes of stack space here.
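  // Worked out (for illustration): len == 512 jints means longwords == 256,
  // and the three scratch arrays (a, n, m) then need
  // 256 * sizeof(julong) * 3 == 256 * 8 * 3 == 6144 bytes, i.e. the 6k noted
  // above and guaranteed below.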
3819   int divisor = sizeof(julong) * 3;
3820   guarantee(longwords <= (8192 / divisor), "must be");
3821   int total_allocation = longwords * sizeof (julong) * 3;
3822   julong *scratch = (julong *)alloca(total_allocation);
3823 
3824   // Local scratch arrays
3825   julong
3826     *a = scratch + 0 * longwords,
3827     *n = scratch + 1 * longwords,
3828     *m = scratch + 2 * longwords;
3829 
3830   reverse_words((julong *)a_ints, a, longwords);
3831   reverse_words((julong *)n_ints, n, longwords);
3832 
3833   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3834     ::montgomery_square(a, n, m, (julong)inv, longwords);
3835   } else {
3836     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3837   }
3838 
3839   reverse_words(m, (julong *)m_ints, longwords);
3840 }
3841 
3842 #ifdef COMPILER2
3843 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3844 //
3845 //------------------------------generate_exception_blob---------------------------
3846 // Creates the exception blob at the end.
3847 // Compiled methods jump to this code through the exception blob
3848 // (see emit_exception_handler in the x86_64.ad file).
3849 //
3850 // Given an exception pc at a call we call into the runtime for the
3851 // handler in this method. This handler might merely restore state
3852 // (i.e. callee save registers), unwind the frame, and jump to the
3853 // exception handler for the nmethod if there is no Java level handler
3854 // for the nmethod.
3855 //
3856 // This code is entered with a jmp.
3857 //
3858 // Arguments:
3859 //   rax: exception oop
3860 //   rdx: exception pc
3861 //
3862 // Results:
3863 //   rax: exception oop
3864 //   rdx: exception pc in caller or ???
3865 //   destination: exception handler of caller
3866 //
3867 // Note: the exception pc MUST be at a call (precise debug information)
3868 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3869 //
3870 
3871 void OptoRuntime::generate_exception_blob() {
3872   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3873   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3874   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3875 
3876   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3877 
3878   // Allocate space for the code
3879   ResourceMark rm;
3880   // Setup code generation tools
3881   CodeBuffer buffer("exception_blob", 2048, 1024);
3882   MacroAssembler* masm = new MacroAssembler(&buffer);
3883 
3884 
3885   address start = __ pc();
3886 
3887   // Exception pc is 'return address' for stack walker
3888   __ push(rdx);
3889   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3890 
3891   // Save callee-saved registers.  See x86_64.ad.
3892 
3893   // rbp is an implicitly saved callee saved register (i.e., the calling
3894   // convention will save/restore it in the prolog/epilog). Other than that
3895   // there are no callee save registers now that adapter frames are gone.
3896 
3897   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3898 
3899   // Store exception in Thread object. We cannot pass any arguments to the
3900   // handle_exception call, since we do not want to make any assumption
3901   // about the size of the frame where the exception happened in.
3902   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3903   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3904   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3905 
3906   // This call does all the hard work.  It checks if an exception handler
3907   // exists in the method.
3908   // If so, it returns the handler address.
3909   // If not, it prepares for stack-unwinding, restoring the callee-save
3910   // registers of the frame being removed.
3911   //
3912   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3913 
3914   // At a method handle call, the stack may not be properly aligned
3915   // when returning with an exception.
3916   address the_pc = __ pc();
3917   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3918   __ mov(c_rarg0, r15_thread);
3919   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3920   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3921 
3922   // Set an oopmap for the call site.  This oopmap will only be used if we
3923   // are unwinding the stack.  Hence, all locations will be dead.
3924   // Callee-saved registers will be the same as the frame above (i.e.,
3925   // handle_exception_stub), since they were restored when we got the
3926   // exception.
3927 
3928   OopMapSet* oop_maps = new OopMapSet();
3929 
3930   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3931 
3932   __ reset_last_Java_frame(false);
3933 
3934   // Restore callee-saved registers
3935 
3936   // rbp is an implicitly saved callee-saved register (i.e., the calling
3937   // convention will save/restore it in the prolog/epilog). Other than that
3938   // there are no callee save registers now that adapter frames are gone.
3939 
3940   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3941 
3942   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3943   __ pop(rdx);                  // No need for exception pc anymore
3944 
3945   // rax: exception handler
3946 
3947   // We have a handler in rax (could be deopt blob).
3948   __ mov(r8, rax);
3949 
3950   // Get the exception oop
3951   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3952   // Get the exception pc in case we are deoptimized
3953   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3954 #ifdef ASSERT
3955   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3956   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3957 #endif
3958   // Clear the exception oop so GC no longer processes it as a root.
3959   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3960 
3961   // rax: exception oop
3962   // r8:  exception handler
3963   // rdx: exception pc
3964   // Jump to handler
3965 
3966   __ jmp(r8);
3967 
3968   // Make sure all code is generated
3969   masm->flush();
3970 
3971   // Set exception blob
3972   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3973 }
3974 #endif // COMPILER2
3975 
3976 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
3977   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
3978   CodeBuffer buffer(buf);
3979   short buffer_locs[20];
3980   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
3981                                          sizeof(buffer_locs)/sizeof(relocInfo));
3982 
3983   MacroAssembler* masm = new MacroAssembler(&buffer);
3984 
3985   const Array<SigEntry>* sig_vk = vk->extended_sig();
3986   const Array<VMRegPair>* regs = vk->return_regs();
3987 
3988   int pack_fields_jobject_off = __ offset();
3989   // Resolve pre-allocated buffer from JNI handle.
3990   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
3991   __ movptr(rax, Address(r13, 0));
3992   __ resolve_jobject(rax /* value */,
3993                      r15_thread /* thread */,
3994                      r12 /* tmp */);
3995   __ movptr(Address(r13, 0), rax);
3996 
3997   int pack_fields_off = __ offset();
3998 
3999   int j = 1;
4000   for (int i = 0; i < sig_vk->length(); i++) {
4001     BasicType bt = sig_vk->at(i)._bt;
4002     if (bt == T_METADATA) {
4003       continue;
4004     }
4005     if (bt == T_VOID) {
4006       if (sig_vk->at(i-1)._bt == T_LONG ||
4007           sig_vk->at(i-1)._bt == T_DOUBLE) {
4008         j++;
4009       }
4010       continue;
4011     }
4012     int off = sig_vk->at(i)._offset;
4013     assert(off > 0, "offset in object should be positive");
4014     VMRegPair pair = regs->at(j);
4015     VMReg r_1 = pair.first();
4016     VMReg r_2 = pair.second();
4017     Address to(rax, off);
4018     if (bt == T_FLOAT) {
4019       __ movflt(to, r_1->as_XMMRegister());
4020     } else if (bt == T_DOUBLE) {
4021       __ movdbl(to, r_1->as_XMMRegister());
4022     } else {
4023       Register val = r_1->as_Register();
4024       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
4025       if (is_reference_type(bt)) {
4026         __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
4027       } else {
4028         __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
4029       }
4030     }
4031     j++;
4032   }
4033   assert(j == regs->length(), "missed a field?");
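
  // A hedged example of the bookkeeping above (the exact extended-signature
  // conventions live in the shared SigEntry code): for an inline type with
  // fields { int x; long y; } the extended signature carries a T_METADATA
  // marker (skipped, no register), then T_INT, then T_LONG followed by a
  // T_VOID placeholder for its upper half (which only bumps j). Entry 0 of
  // regs is presumably reserved for the buffered oop itself, which is why j
  // starts at 1 and must equal regs->length() once every field is stored.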
4034 
4035   __ ret(0);
4036 
4037   int unpack_fields_off = __ offset();
4038 
4039   Label skip;
4040   __ testptr(rax, rax);
4041   __ jcc(Assembler::zero, skip);
4042 
4043   j = 1;
4044   for (int i = 0; i < sig_vk->length(); i++) {
4045     BasicType bt = sig_vk->at(i)._bt;
4046     if (bt == T_METADATA) {
4047       continue;
4048     }
4049     if (bt == T_VOID) {
4050       if (sig_vk->at(i-1)._bt == T_LONG ||
4051           sig_vk->at(i-1)._bt == T_DOUBLE) {
4052         j++;
4053       }
4054       continue;
4055     }
4056     int off = sig_vk->at(i)._offset;
4057     assert(off > 0, "offset in object should be positive");
4058     VMRegPair pair = regs->at(j);
4059     VMReg r_1 = pair.first();
4060     VMReg r_2 = pair.second();
4061     Address from(rax, off);
4062     if (bt == T_FLOAT) {
4063       __ movflt(r_1->as_XMMRegister(), from);
4064     } else if (bt == T_DOUBLE) {
4065       __ movdbl(r_1->as_XMMRegister(), from);
4066     } else if (bt == T_OBJECT || bt == T_ARRAY) {
4067       assert_different_registers(rax, r_1->as_Register());
4068       __ load_heap_oop(r_1->as_Register(), from);
4069     } else {
4070       assert(is_java_primitive(bt), "unexpected basic type");
4071       assert_different_registers(rax, r_1->as_Register());
4072       size_t size_in_bytes = type2aelembytes(bt);
4073       __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
4074     }
4075     j++;
4076   }
4077   assert(j == regs->length(), "missed a field?");
4078 
4079   __ bind(skip);
4080   __ ret(0);
4081 
4082   __ flush();
4083 
4084   return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
4085 }