1 /*
   2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "code/aotCodeCache.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/timerTrace.hpp"
  56 #include "runtime/vframeArray.hpp"
  57 #include "runtime/vm_version.hpp"
  58 #include "utilities/align.hpp"
  59 #include "utilities/checkedCast.hpp"
  60 #include "utilities/formatBuffer.hpp"
  61 #include "vmreg_x86.inline.hpp"
  62 #ifdef COMPILER1
  63 #include "c1/c1_Runtime1.hpp"
  64 #endif
  65 #ifdef COMPILER2
  66 #include "opto/runtime.hpp"
  67 #endif
  68 #if INCLUDE_JVMCI
  69 #include "jvmci/jvmciJavaClasses.hpp"
  70 #endif
  71 
  72 #define __ masm->
  73 
  74 #ifdef PRODUCT
  75 #define BLOCK_COMMENT(str) /* nothing */
  76 #else
  77 #define BLOCK_COMMENT(str) __ block_comment(str)
  78 #endif // PRODUCT
  79 
  80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  81 
  82 class RegisterSaver {
  83   // Capture info about frame layout.  Layout offsets are in jint
  84   // units because compiler frame slots are jints.
  85 #define XSAVE_AREA_BEGIN 160
  86 #define XSAVE_AREA_YMM_BEGIN 576
  87 #define XSAVE_AREA_EGPRS 960
  88 #define XSAVE_AREA_OPMASK_BEGIN 1088
  89 #define XSAVE_AREA_ZMM_BEGIN 1152
  90 #define XSAVE_AREA_UPPERBANK 1664
  91 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  92 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  93 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  94 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
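  // For illustration only: DEF_XMM_OFFS(1) expands to
  //   xmm1_off = xmm_off + (1)*16/BytesPerInt, xmm1H_off
  // i.e. successive XMM save slots are 16 bytes (4 jint slots) apart in the
  // FXSAVE image, and each register gets a low/high enumerator pair so that
  // both halves can be named in the OopMap.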
  96   enum layout {
  97     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  98     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
  99     DEF_XMM_OFFS(0),
 100     DEF_XMM_OFFS(1),
 101     // 2..15 are implied in range usage
 102     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 103     DEF_YMM_OFFS(0),
 104     DEF_YMM_OFFS(1),
 105     r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 106     r16H_off,
 107     r17_off, r17H_off,
 108     r18_off, r18H_off,
 109     r19_off, r19H_off,
 110     r20_off, r20H_off,
 111     r21_off, r21H_off,
 112     r22_off, r22H_off,
 113     r23_off, r23H_off,
 114     r24_off, r24H_off,
 115     r25_off, r25H_off,
 116     r26_off, r26H_off,
 117     r27_off, r27H_off,
 118     r28_off, r28H_off,
 119     r29_off, r29H_off,
 120     r30_off, r30H_off,
 121     r31_off, r31H_off,
 122     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 123     DEF_OPMASK_OFFS(0),
 124     DEF_OPMASK_OFFS(1),
 125     // 2..7 are implied in range usage
 126     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 127     DEF_ZMM_OFFS(0),
 128     DEF_ZMM_OFFS(1),
 129     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 130     DEF_ZMM_UPPER_OFFS(16),
 131     DEF_ZMM_UPPER_OFFS(17),
 132     // 18..31 are implied in range usage
 133     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 134     fpu_stateH_end,
 135     r15_off, r15H_off,
 136     r14_off, r14H_off,
 137     r13_off, r13H_off,
 138     r12_off, r12H_off,
 139     r11_off, r11H_off,
 140     r10_off, r10H_off,
 141     r9_off,  r9H_off,
 142     r8_off,  r8H_off,
 143     rdi_off, rdiH_off,
 144     rsi_off, rsiH_off,
 145     ignore_off, ignoreH_off,  // extra copy of rbp
 146     rsp_off, rspH_off,
 147     rbx_off, rbxH_off,
 148     rdx_off, rdxH_off,
 149     rcx_off, rcxH_off,
 150     rax_off, raxH_off,
 151     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 152     align_off, alignH_off,
 153     flags_off, flagsH_off,
 154     // The frame sender code expects that rbp will be in the "natural" place and
 155     // will override any oopMap setting for it. We must therefore force the layout
 156     // so that it agrees with the frame sender code.
 157     rbp_off, rbpH_off,        // copy of rbp we will restore
 158     return_off, returnH_off,  // slot for return address
 159     reg_save_size             // size in compiler stack slots
 160   };
 161 
 162  public:
 163   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 164   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 165 
 166   // Offsets into the register save area
 167   // Used by deoptimization when it is managing result register
 168   // values on its own
 169 
 170   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 171   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 172   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 173   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 174   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 175   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 176 
 177   // During deoptimization only the result registers need to be restored,
 178   // all the other values have already been extracted.
 179   static void restore_result_registers(MacroAssembler* masm);
 180 };
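
// Illustrative usage sketch (not a prescription; the actual callers are e.g.
// the safepoint and deoptimization blobs generated elsewhere in this file):
//
//   int frame_size_in_words;
//   OopMap* map = RegisterSaver::save_live_registers(masm, 0 /* additional_frame_words */,
//                                                    &frame_size_in_words, false /* save_wide_vectors */);
//   // ... emit the runtime call and record map at its pc ...
//   RegisterSaver::restore_live_registers(masm);
//
// The *_offset_in_bytes() helpers above convert the jint-sized layout slots into
// byte offsets within the save frame, e.g. rax_offset_in_bytes() == BytesPerInt * rax_off.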
 181 
 182 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 183   int off = 0;
 184   int num_xmm_regs = XMMRegister::available_xmm_registers();
 185 #if COMPILER2_OR_JVMCI
 186   if (save_wide_vectors && UseAVX == 0) {
 187     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 188   }
 189   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 190 #else
 191   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 192 #endif
 193 
  // Always make the frame size 16-byte aligned; both vector and non-vector frames are allocated this way.
 195   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 196   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 197   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 198   // CodeBlob frame size is in words.
 199   int frame_size_in_words = frame_size_in_bytes / wordSize;
 200   *total_frame_words = frame_size_in_words;
 201 
  // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, like a normal enter would leave it.
 207 
 208   __ enter();          // rsp becomes 16-byte aligned here
 209   __ pushf();
 210   // Make sure rsp stays 16-byte aligned
 211   __ subq(rsp, 8);
 212   // Push CPU state in multiple of 16 bytes
 213   __ save_legacy_gprs();
 214   __ push_FPU_state();
 215 
 216 
 217   // push cpu state handles this on EVEX enabled targets
 218   if (save_wide_vectors) {
 219     // Save upper half of YMM registers(0..15)
 220     int base_addr = XSAVE_AREA_YMM_BEGIN;
 221     for (int n = 0; n < 16; n++) {
 222       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 223     }
 224     if (VM_Version::supports_evex()) {
 225       // Save upper half of ZMM registers(0..15)
 226       base_addr = XSAVE_AREA_ZMM_BEGIN;
 227       for (int n = 0; n < 16; n++) {
 228         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 229       }
 230       // Save full ZMM registers(16..num_xmm_regs)
 231       base_addr = XSAVE_AREA_UPPERBANK;
 232       off = 0;
 233       int vector_len = Assembler::AVX_512bit;
 234       for (int n = 16; n < num_xmm_regs; n++) {
 235         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 236       }
 237 #if COMPILER2_OR_JVMCI
 238       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 239       off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
 241         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 242       }
 243 #endif
 244     }
 245   } else {
 246     if (VM_Version::supports_evex()) {
 247       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 248       int base_addr = XSAVE_AREA_UPPERBANK;
 249       off = 0;
 250       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 251       for (int n = 16; n < num_xmm_regs; n++) {
 252         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 253       }
 254 #if COMPILER2_OR_JVMCI
 255       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 256       off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
 258         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 259       }
 260 #endif
 261     }
 262   }
 263 
 264 #if COMPILER2_OR_JVMCI
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
    }
  }
 272 #endif
 273 
 274   __ vzeroupper();
 275   if (frame::arg_reg_save_area_bytes != 0) {
 276     // Allocate argument register save area
 277     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 278   }
 279 
 280   // Set an oopmap for the call site.  This oopmap will map all
 281   // oop-registers and debug-info registers as callee-saved.  This
 282   // will allow deoptimization at this safepoint to find all possible
 283   // debug-info recordings, as well as let GC find all oops.
 284 
 285   OopMapSet *oop_maps = new OopMapSet();
 286   OopMap* map = new OopMap(frame_size_in_slots, 0);
 287 
 288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 289 
 290   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 291   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 293   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp's location is known implicitly by the frame sender code, needs no oopmap,
  // and the location where rbp was saved is ignored
 296   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 297   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 305   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 306 
 307   if (UseAPX) {
 308     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 309     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 324   }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets it is also included in the xsave area.
 327   off = xmm0_off;
 328   int delta = xmm1_off - off;
 329   for (int n = 0; n < 16; n++) {
 330     XMMRegister xmm_name = as_XMMRegister(n);
 331     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 332     off += delta;
 333   }
 334   if (UseAVX > 2) {
 335     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 336     off = zmm16_off;
 337     delta = zmm17_off - off;
 338     for (int n = 16; n < num_xmm_regs; n++) {
 339       XMMRegister zmm_name = as_XMMRegister(n);
 340       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 341       off += delta;
 342     }
 343   }
 344 
 345 #if COMPILER2_OR_JVMCI
 346   if (save_wide_vectors) {
 347     // Save upper half of YMM registers(0..15)
 348     off = ymm0_off;
 349     delta = ymm1_off - ymm0_off;
 350     for (int n = 0; n < 16; n++) {
 351       XMMRegister ymm_name = as_XMMRegister(n);
 352       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 353       off += delta;
 354     }
 355     if (VM_Version::supports_evex()) {
 356       // Save upper half of ZMM registers(0..15)
 357       off = zmm0_off;
 358       delta = zmm1_off - zmm0_off;
 359       for (int n = 0; n < 16; n++) {
 360         XMMRegister zmm_name = as_XMMRegister(n);
 361         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 362         off += delta;
 363       }
 364     }
 365   }
 366 #endif // COMPILER2_OR_JVMCI
 367 
 368   // %%% These should all be a waste but we'll keep things as they were for now
 369   if (true) {
 370     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 371     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 373     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 374     // rbp location is known implicitly by the frame sender code, needs no oopmap
 375     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 376     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 385     if (UseAPX) {
 386       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 387       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 402     }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets it is also included in the xsave area.
 405     off = xmm0H_off;
 406     delta = xmm1H_off - off;
 407     for (int n = 0; n < 16; n++) {
 408       XMMRegister xmm_name = as_XMMRegister(n);
 409       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 410       off += delta;
 411     }
 412     if (UseAVX > 2) {
 413       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 414       off = zmm16H_off;
 415       delta = zmm17H_off - off;
 416       for (int n = 16; n < num_xmm_regs; n++) {
 417         XMMRegister zmm_name = as_XMMRegister(n);
 418         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 419         off += delta;
 420       }
 421     }
 422   }
 423 
 424   return map;
 425 }
 426 
 427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 428   int num_xmm_regs = XMMRegister::available_xmm_registers();
 429   if (frame::arg_reg_save_area_bytes != 0) {
 430     // Pop arg register save area
 431     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 432   }
 433 
 434 #if COMPILER2_OR_JVMCI
 435   if (restore_wide_vectors) {
 436     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 437     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 438   }
 439 #else
 440   assert(!restore_wide_vectors, "vectors are generated only by C2");
 441 #endif
 442 
 443   __ vzeroupper();
 444 
 445   // On EVEX enabled targets everything is handled in pop fpu state
 446   if (restore_wide_vectors) {
 447     // Restore upper half of YMM registers (0..15)
 448     int base_addr = XSAVE_AREA_YMM_BEGIN;
 449     for (int n = 0; n < 16; n++) {
 450       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 451     }
 452     if (VM_Version::supports_evex()) {
 453       // Restore upper half of ZMM registers (0..15)
 454       base_addr = XSAVE_AREA_ZMM_BEGIN;
 455       for (int n = 0; n < 16; n++) {
 456         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 457       }
 458       // Restore full ZMM registers(16..num_xmm_regs)
 459       base_addr = XSAVE_AREA_UPPERBANK;
 460       int vector_len = Assembler::AVX_512bit;
 461       int off = 0;
 462       for (int n = 16; n < num_xmm_regs; n++) {
 463         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 464       }
 465 #if COMPILER2_OR_JVMCI
 466       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 467       off = 0;
 468       for (int n = 0; n < KRegister::number_of_registers; n++) {
 469         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 470       }
 471 #endif
 472     }
 473   } else {
 474     if (VM_Version::supports_evex()) {
 475       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 476       int base_addr = XSAVE_AREA_UPPERBANK;
 477       int off = 0;
 478       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 479       for (int n = 16; n < num_xmm_regs; n++) {
 480         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 481       }
 482 #if COMPILER2_OR_JVMCI
 483       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 484       off = 0;
 485       for (int n = 0; n < KRegister::number_of_registers; n++) {
 486         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 487       }
 488 #endif
 489     }
 490   }
 491 
 492 #if COMPILER2_OR_JVMCI
 493   if (UseAPX) {
 494     int base_addr = XSAVE_AREA_EGPRS;
 495     int off = 0;
 496     for (int n = 16; n < Register::number_of_registers; n++) {
 497       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 498     }
 499   }
 500 #endif
 501 
 502   // Recover CPU state
 503   __ pop_FPU_state();
 504   __ restore_legacy_gprs();
 505   __ addq(rsp, 8);
 506   __ popf();
 507   // Get the rbp described implicitly by the calling convention (no oopMap)
 508   __ pop(rbp);
 509 }
 510 
 511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 512 
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee-saved register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only the result registers need to be restored here.
 518 
 519   // Restore fp result register
 520   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 521   // Restore integer result register
 522   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 523   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 524 
  // Pop all of the register save area off the stack except the return address
 526   __ addptr(rsp, return_offset_in_bytes());
 527 }
 528 
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
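// For example, a 32-byte YMM or 64-byte ZMM vector counts as wide; a 16-byte
// XMM vector does not.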
 531 bool SharedRuntime::is_wide_vector(int size) {
 532   return size > 16;
 533 }
 534 
 535 // ---------------------------------------------------------------------------
 536 // Read the array of BasicTypes from a signature, and compute where the
 537 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 538 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 539 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 540 // as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
 545 
 546 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 547 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 548 // units regardless of build. Of course for i486 there is no 64 bit build
 549 
 550 // The Java calling convention is a "shifted" version of the C ABI.
 551 // By skipping the first C ABI register we can call non-static jni methods
 552 // with small numbers of arguments without having to shuffle the arguments
 553 // at all. Since we control the java ABI we ought to at least get some
 554 // advantage out of it.
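//
// For example (illustrative), a signature (int, long, Object, float, double)
// is laid out by the loop below as:
//   int    -> INT_ArgReg[0] (j_rarg0), set1()
//   long   -> INT_ArgReg[1] (j_rarg1), set2(); its trailing T_VOID half is set_bad()
//   Object -> INT_ArgReg[2] (j_rarg2), set2()
//   float  -> FP_ArgReg[0]  (j_farg0), set1()
//   double -> FP_ArgReg[1]  (j_farg1), set2(); its trailing T_VOID half is set_bad()
// Only once the j_rargs or j_fargs listed above are exhausted do arguments spill
// to 64-bit aligned stack slots; the return value is that slot count in 4-byte units.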
 555 
 556 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 557                                            VMRegPair *regs,
 558                                            int total_args_passed) {
 559 
 560   // Create the mapping between argument positions and
 561   // registers.
 562   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 563     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 564   };
 565   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 566     j_farg0, j_farg1, j_farg2, j_farg3,
 567     j_farg4, j_farg5, j_farg6, j_farg7
 568   };
 569 
 570 
 571   uint int_args = 0;
 572   uint fp_args = 0;
 573   uint stk_args = 0;
 574 
 575   for (int i = 0; i < total_args_passed; i++) {
 576     switch (sig_bt[i]) {
 577     case T_BOOLEAN:
 578     case T_CHAR:
 579     case T_BYTE:
 580     case T_SHORT:
 581     case T_INT:
 582       if (int_args < Argument::n_int_register_parameters_j) {
 583         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 584       } else {
 585         stk_args = align_up(stk_args, 2);
 586         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 587         stk_args += 1;
 588       }
 589       break;
 590     case T_VOID:
 591       // halves of T_LONG or T_DOUBLE
 592       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 593       regs[i].set_bad();
 594       break;
 595     case T_LONG:
 596       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 597       // fall through
 598     case T_OBJECT:
 599     case T_ARRAY:
 600     case T_ADDRESS:
 601       if (int_args < Argument::n_int_register_parameters_j) {
 602         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 603       } else {
 604         stk_args = align_up(stk_args, 2);
 605         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 606         stk_args += 2;
 607       }
 608       break;
 609     case T_FLOAT:
 610       if (fp_args < Argument::n_float_register_parameters_j) {
 611         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 612       } else {
 613         stk_args = align_up(stk_args, 2);
 614         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 615         stk_args += 1;
 616       }
 617       break;
 618     case T_DOUBLE:
 619       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 620       if (fp_args < Argument::n_float_register_parameters_j) {
 621         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 622       } else {
 623         stk_args = align_up(stk_args, 2);
 624         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 625         stk_args += 2;
 626       }
 627       break;
 628     default:
 629       ShouldNotReachHere();
 630       break;
 631     }
 632   }
 633 
 634   return stk_args;
 635 }
 636 
// Patch the caller's callsite with the entry to compiled code, if it exists.
 638 static void patch_callers_callsite(MacroAssembler *masm) {
 639   Label L;
 640   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 641   __ jcc(Assembler::equal, L);
 642 
 643   // Save the current stack pointer
 644   __ mov(r13, rsp);
 645   // Schedule the branch target address early.
 646   // Call into the VM to patch the caller, then jump to compiled callee
 647   // rax isn't live so capture return address while we easily can
 648   __ movptr(rax, Address(rsp, 0));
 649 
 650   // align stack so push_CPU_state doesn't fault
 651   __ andptr(rsp, -(StackAlignmentInBytes));
 652   __ push_CPU_state();
 653   __ vzeroupper();
 654   // VM needs caller's callsite
 655   // VM needs target method
 656   // This needs to be a long call since we will relocate this adapter to
 657   // the codeBuffer and it may not reach
 658 
 659   // Allocate argument register save area
 660   if (frame::arg_reg_save_area_bytes != 0) {
 661     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 662   }
 663   __ mov(c_rarg0, rbx);
 664   __ mov(c_rarg1, rax);
 665   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 666 
 667   // De-allocate argument register save area
 668   if (frame::arg_reg_save_area_bytes != 0) {
 669     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 670   }
 671 
 672   __ vzeroupper();
 673   __ pop_CPU_state();
 674   // restore sp
 675   __ mov(rsp, r13);
 676   __ bind(L);
 677 }
 678 
 679 static void gen_c2i_adapter(MacroAssembler *masm,
 680                             int total_args_passed,
 681                             int comp_args_on_stack,
 682                             const BasicType *sig_bt,
 683                             const VMRegPair *regs,
 684                             Label& skip_fixup) {
 685   // Before we get into the guts of the C2I adapter, see if we should be here
 686   // at all.  We've come from compiled code and are attempting to jump to the
 687   // interpreter, which means the caller made a static call to get here
 688   // (vcalls always get a compiled target if there is one).  Check for a
 689   // compiled target.  If there is one, we need to patch the caller's call.
 690   patch_callers_callsite(masm);
 691 
 692   __ bind(skip_fixup);
 693 
 694   // Since all args are passed on the stack, total_args_passed *
 695   // Interpreter::stackElementSize is the space we need.
 696 
 697   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 698 
 699   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 700 
 701   // stack is aligned, keep it that way
 702   // This is not currently needed or enforced by the interpreter, but
 703   // we might as well conform to the ABI.
 704   extraspace = align_up(extraspace, 2*wordSize);
 705 
 706   // set senderSP value
 707   __ lea(r13, Address(rsp, wordSize));
 708 
 709 #ifdef ASSERT
 710   __ check_stack_alignment(r13, "sender stack not aligned");
 711 #endif
 712   if (extraspace > 0) {
 713     // Pop the return address
 714     __ pop(rax);
 715 
 716     __ subptr(rsp, extraspace);
 717 
 718     // Push the return address
 719     __ push(rax);
 720 
 721     // Account for the return address location since we store it first rather
 722     // than hold it in a register across all the shuffling
 723     extraspace += wordSize;
 724   }
 725 
 726 #ifdef ASSERT
 727   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 728 #endif
 729 
 730   // Now write the args into the outgoing interpreter space
 731   for (int i = 0; i < total_args_passed; i++) {
 732     if (sig_bt[i] == T_VOID) {
 733       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 734       continue;
 735     }
 736 
 737     // offset to start parameters
 738     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 739     int next_off = st_off - Interpreter::stackElementSize;
 740 
 741     // Say 4 args:
 742     // i   st_off
 743     // 0   32 T_LONG
 744     // 1   24 T_VOID
 745     // 2   16 T_OBJECT
 746     // 3    8 T_BOOL
 747     // -    0 return address
 748     //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
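    //
    // Continuing the example: for the T_LONG at i == 0 the value is stored once,
    // at next_off == 24 (the slot labelled T_VOID above), while st_off == 32 only
    // receives the 0xdead... junk pattern in debug builds (see below).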
 753 
 754     VMReg r_1 = regs[i].first();
 755     VMReg r_2 = regs[i].second();
 756     if (!r_1->is_valid()) {
 757       assert(!r_2->is_valid(), "");
 758       continue;
 759     }
 760     if (r_1->is_stack()) {
      // memory to memory, use rax as a temporary
 762       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 763       if (!r_2->is_valid()) {
 764         // sign extend??
 765         __ movl(rax, Address(rsp, ld_off));
 766         __ movptr(Address(rsp, st_off), rax);
 767 
 768       } else {
 769 
 770         __ movq(rax, Address(rsp, ld_off));
 771 
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG.
        // T_DOUBLE and T_LONG use two slots in the interpreter.
 774         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 775           // ld_off == LSW, ld_off+wordSize == MSW
 776           // st_off == MSW, next_off == LSW
 777           __ movq(Address(rsp, next_off), rax);
 778 #ifdef ASSERT
 779           // Overwrite the unused slot with known junk
 780           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 781           __ movptr(Address(rsp, st_off), rax);
 782 #endif /* ASSERT */
 783         } else {
 784           __ movq(Address(rsp, st_off), rax);
 785         }
 786       }
 787     } else if (r_1->is_Register()) {
 788       Register r = r_1->as_Register();
 789       if (!r_2->is_valid()) {
        // must be only an int (or less), so move only 32 bits to the slot
        // why not sign extend??
 792         __ movl(Address(rsp, st_off), r);
 793       } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG.
        // T_DOUBLE and T_LONG use two slots in the interpreter.
 796         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 797           // long/double in gpr
 798 #ifdef ASSERT
 799           // Overwrite the unused slot with known junk
 800           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 801           __ movptr(Address(rsp, st_off), rax);
 802 #endif /* ASSERT */
 803           __ movq(Address(rsp, next_off), r);
 804         } else {
 805           __ movptr(Address(rsp, st_off), r);
 806         }
 807       }
 808     } else {
 809       assert(r_1->is_XMMRegister(), "");
 810       if (!r_2->is_valid()) {
        // only a float, use just part of the slot
 812         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 813       } else {
 814 #ifdef ASSERT
 815         // Overwrite the unused slot with known junk
 816         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 817         __ movptr(Address(rsp, st_off), rax);
 818 #endif /* ASSERT */
 819         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 820       }
 821     }
 822   }
 823 
 824   // Schedule the branch target address early.
 825   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 826   __ jmp(rcx);
 827 }
 828 
 829 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 830                                     int total_args_passed,
 831                                     int comp_args_on_stack,
 832                                     const BasicType *sig_bt,
 833                                     const VMRegPair *regs) {
 834 
  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get the args ready.
  // In addition we use r13 to locate all the interpreter args, as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose the alignment we expect in all compiled code and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
 843 
 844   // Adapters can be frameless because they do not require the caller
 845   // to perform additional cleanup work, such as correcting the stack pointer.
 846   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 847   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 848   // even if a callee has modified the stack pointer.
 849   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 850   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 851   // up via the senderSP register).
 852   // In other words, if *either* the caller or callee is interpreted, we can
 853   // get the stack pointer repaired after a call.
 854   // This is why c2i and i2c adapters cannot be indefinitely composed.
 855   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 856   // both caller and callee would be compiled methods, and neither would
 857   // clean up the stack pointer changes performed by the two adapters.
 858   // If this happens, control eventually transfers back to the compiled
 859   // caller, but with an uncorrected stack, causing delayed havoc.
 860 
 861   // Must preserve original SP for loading incoming arguments because
 862   // we need to align the outgoing SP for compiled code.
 863   __ movptr(r11, rsp);
 864 
 865   // Pick up the return address
 866   __ pop(rax);
 867 
 868   // Convert 4-byte c2 stack slots to words.
 869   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 870 
 871   if (comp_args_on_stack) {
 872     __ subptr(rsp, comp_words_on_stack * wordSize);
 873   }
 874 
 875   // Ensure compiled code always sees stack at proper alignment
 876   __ andptr(rsp, -16);
 877 
  // Push the return address, misaligning the stack the way the youngest frame always
  // sees it, i.e. as if a call instruction had just pushed it.
 880   __ push(rax);
 881 
 882   // Put saved SP in another register
 883   const Register saved_sp = rax;
 884   __ movptr(saved_sp, r11);
 885 
 886   // Will jump to the compiled code just as if compiled code was doing it.
 887   // Pre-load the register-jump target early, to schedule it better.
 888   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 889 
 890 #if INCLUDE_JVMCI
 891   if (EnableJVMCI) {
 892     // check if this call should be routed towards a specific entry point
 893     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 894     Label no_alternative_target;
 895     __ jcc(Assembler::equal, no_alternative_target);
 896     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 897     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 898     __ bind(no_alternative_target);
 899   }
 900 #endif // INCLUDE_JVMCI
 901 
 902   // Now generate the shuffle code.  Pick up all register args and move the
 903   // rest through the floating point stack top.
 904   for (int i = 0; i < total_args_passed; i++) {
 905     if (sig_bt[i] == T_VOID) {
 906       // Longs and doubles are passed in native word order, but misaligned
 907       // in the 32-bit build.
 908       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 909       continue;
 910     }
 911 
 912     // Pick up 0, 1 or 2 words from SP+offset.
 913 
 914     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 915             "scrambled load targets?");
 916     // Load in argument order going down.
 917     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 918     // Point to interpreter value (vs. tag)
 919     int next_off = ld_off - Interpreter::stackElementSize;
 920     //
 921     //
 922     //
 923     VMReg r_1 = regs[i].first();
 924     VMReg r_2 = regs[i].second();
 925     if (!r_1->is_valid()) {
 926       assert(!r_2->is_valid(), "");
 927       continue;
 928     }
 929     if (r_1->is_stack()) {
 930       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 931       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 932 
      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going through a c2i because of a miss a reasonable value of r13
      // will be generated.
 936       if (!r_2->is_valid()) {
 937         // sign extend???
 938         __ movl(r13, Address(saved_sp, ld_off));
 939         __ movptr(Address(rsp, st_off), r13);
 940       } else {
 941         //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed at negative offsets, so the LSW is at the LOW address.
 948 
 949         // ld_off is MSW so get LSW
 950         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 951                            next_off : ld_off;
 952         __ movq(r13, Address(saved_sp, offset));
 953         // st_off is LSW (i.e. reg.first())
 954         __ movq(Address(rsp, st_off), r13);
 955       }
 956     } else if (r_1->is_Register()) {  // Register argument
 957       Register r = r_1->as_Register();
 958       assert(r != rax, "must be different");
 959       if (r_2->is_valid()) {
 960         //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
 964 
 965         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 966                            next_off : ld_off;
 967 
 968         // this can be a misaligned move
 969         __ movq(r, Address(saved_sp, offset));
 970       } else {
 971         // sign extend and use a full word?
 972         __ movl(r, Address(saved_sp, ld_off));
 973       }
 974     } else {
 975       if (!r_2->is_valid()) {
 976         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 977       } else {
 978         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 979       }
 980     }
 981   }
 982 
 983   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 984 
  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race through here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and the arguments are now
  // "compiled", so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try to find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
 994 
 995   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 996 
  // Put the Method* where a c2i would expect it, should we end up there.
  // Only needed because C2 resolve stubs return the Method* as a result in
  // rax.
1000   __ mov(rax, rbx);
1001   __ jmp(r11);
1002 }
1003 
1004 // ---------------------------------------------------------------
1005 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1006                                             int total_args_passed,
1007                                             int comp_args_on_stack,
1008                                             const BasicType *sig_bt,
1009                                             const VMRegPair *regs,
1010                                             address entry_address[AdapterBlob::ENTRY_COUNT]) {
1011   entry_address[AdapterBlob::I2C] = __ pc();
1012 
1013   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1014 
1015   // -------------------------------------------------------------------------
1016   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1017   // to the interpreter.  The args start out packed in the compiled layout.  They
1018   // need to be unpacked into the interpreter layout.  This will almost always
1019   // require some stack space.  We grow the current (compiled) stack, then repack
1020   // the args.  We  finally end in a jump to the generic interpreter entry point.
1021   // On exit from the interpreter, the interpreter will restore our SP (lest the
1022   // compiled code, which relies solely on SP and not RBP, get sick).
1023 
1024   entry_address[AdapterBlob::C2I_Unverified] = __ pc();
1025   Label skip_fixup;
1026 
1027   Register data = rax;
1028   Register receiver = j_rarg0;
1029   Register temp = rbx;
1030 
1031   {
1032     __ ic_check(1 /* end_alignment */);
1033     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
    // The method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
    // the call site corrected.
1037     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1038     __ jcc(Assembler::equal, skip_fixup);
1039     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1040   }
1041 
1042   entry_address[AdapterBlob::C2I] = __ pc();
1043 
1044   // Class initialization barrier for static methods
1045   entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
1046   assert(VM_Version::supports_fast_class_init_checks(), "sanity");
1047   Label L_skip_barrier;
1048   Register method = rbx;
1049 
1050   // Bypass the barrier for non-static methods
1051   Register flags = rscratch1;
1052   __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
1053   __ testl(flags, JVM_ACC_STATIC);
1054   __ jcc(Assembler::zero, L_skip_barrier); // non-static
1055 
1056   Register klass = rscratch1;
1057   __ load_method_holder(klass, method);
1058   __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1059 
1060   __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1061 
1062   __ bind(L_skip_barrier);
1063   entry_address[AdapterBlob::C2I_No_Clinit_Check] = __ pc();
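  // Callers that already know the holder class is initialized can enter at this
  // pc directly, skipping the class-initialization barrier above.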
1064 
1065   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1066   bs->c2i_entry_barrier(masm);
1067 
1068   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1069   return;
1070 }
1071 
1072 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1073                                          VMRegPair *regs,
1074                                          int total_args_passed) {
1075 
// We return the number of VMRegImpl stack slots we need to reserve for all
// the arguments, NOT counting out_preserve_stack_slots.
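//
// For illustration, a native signature (jint, jdouble, jlong) maps to:
//   SysV AMD64: jint -> c_rarg0 (rdi), jdouble -> c_farg0 (xmm0), jlong -> c_rarg1 (rsi)
//   Win64:      jint -> c_rarg0 (rcx), jdouble -> c_farg1 (xmm1), jlong -> c_rarg2 (r8)
// because on Windows every argument position consumes both an integer and an FP
// register slot (see the extra int_args++/fp_args++ bumps below) and shadow space
// for four register arguments is always reserved on the stack.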
1078 
1079 // NOTE: These arrays will have to change when c1 is ported
1080 #ifdef _WIN64
1081     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1082       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1083     };
1084     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1085       c_farg0, c_farg1, c_farg2, c_farg3
1086     };
1087 #else
1088     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1089       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1090     };
1091     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1092       c_farg0, c_farg1, c_farg2, c_farg3,
1093       c_farg4, c_farg5, c_farg6, c_farg7
1094     };
1095 #endif // _WIN64
1096 
1097 
1098     uint int_args = 0;
1099     uint fp_args = 0;
1100     uint stk_args = 0; // inc by 2 each time
1101 
1102     for (int i = 0; i < total_args_passed; i++) {
1103       switch (sig_bt[i]) {
1104       case T_BOOLEAN:
1105       case T_CHAR:
1106       case T_BYTE:
1107       case T_SHORT:
1108       case T_INT:
1109         if (int_args < Argument::n_int_register_parameters_c) {
1110           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1111 #ifdef _WIN64
1112           fp_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1114           stk_args += 2;
1115 #endif
1116         } else {
1117           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1118           stk_args += 2;
1119         }
1120         break;
1121       case T_LONG:
1122         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1123         // fall through
1124       case T_OBJECT:
1125       case T_ARRAY:
1126       case T_ADDRESS:
1127       case T_METADATA:
1128         if (int_args < Argument::n_int_register_parameters_c) {
1129           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1130 #ifdef _WIN64
1131           fp_args++;
1132           stk_args += 2;
1133 #endif
1134         } else {
1135           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1136           stk_args += 2;
1137         }
1138         break;
1139       case T_FLOAT:
1140         if (fp_args < Argument::n_float_register_parameters_c) {
1141           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1142 #ifdef _WIN64
1143           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1145           stk_args += 2;
1146 #endif
1147         } else {
1148           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1149           stk_args += 2;
1150         }
1151         break;
1152       case T_DOUBLE:
1153         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1154         if (fp_args < Argument::n_float_register_parameters_c) {
1155           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1156 #ifdef _WIN64
1157           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1159           stk_args += 2;
1160 #endif
1161         } else {
1162           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1163           stk_args += 2;
1164         }
1165         break;
1166       case T_VOID: // Halves of longs and doubles
1167         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1168         regs[i].set_bad();
1169         break;
1170       default:
1171         ShouldNotReachHere();
1172         break;
1173       }
1174     }
1175 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
1178   if (stk_args < 8) {
1179     stk_args = 8;
1180   }
1181 #endif // _WIN64
1182 
1183   return stk_args;
1184 }
1185 
1186 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1187                                              uint num_bits,
1188                                              uint total_args_passed) {
1189   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1190          "only certain vector sizes are supported for now");
1191 
1192   static const XMMRegister VEC_ArgReg[32] = {
1193      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1194      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1195     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1196     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1197   };
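  // Illustrative mapping: with num_bits == 256 each argument i becomes the pair
  // (xmm_i, xmm_i->next(7)), i.e. eight 32-bit VMReg slots covering one YMM
  // register; no stack slots are handed out here, so stk_args stays 0.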
1198 
1199   uint stk_args = 0;
1200   uint fp_args = 0;
1201 
1202   for (uint i = 0; i < total_args_passed; i++) {
1203     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1204     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1205     regs[i].set_pair(vmreg->next(next_val), vmreg);
1206   }
1207 
1208   return stk_args;
1209 }
1210 
1211 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
  // which by this time is free to use.
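  // (e.g. a double result sits in xmm0 and is parked at [rbp - wordSize] until
  // restore_native_result() reloads it)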
1214   switch (ret_type) {
1215   case T_FLOAT:
1216     __ movflt(Address(rbp, -wordSize), xmm0);
1217     break;
1218   case T_DOUBLE:
1219     __ movdbl(Address(rbp, -wordSize), xmm0);
1220     break;
1221   case T_VOID:  break;
1222   default: {
1223     __ movptr(Address(rbp, -wordSize), rax);
1224     }
1225   }
1226 }
1227 
1228 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
  // which by this time is free to use.
1231   switch (ret_type) {
1232   case T_FLOAT:
1233     __ movflt(xmm0, Address(rbp, -wordSize));
1234     break;
1235   case T_DOUBLE:
1236     __ movdbl(xmm0, Address(rbp, -wordSize));
1237     break;
1238   case T_VOID:  break;
1239   default: {
1240     __ movptr(rax, Address(rbp, -wordSize));
1241     }
1242   }
1243 }
1244 
1245 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1246     for ( int i = first_arg ; i < arg_count ; i++ ) {
1247       if (args[i].first()->is_Register()) {
1248         __ push(args[i].first()->as_Register());
1249       } else if (args[i].first()->is_XMMRegister()) {
1250         __ subptr(rsp, 2*wordSize);
1251         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1252       }
1253     }
1254 }
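
// Note: restore_args() below walks the arguments in reverse order so that its
// pops and stack adjustments exactly mirror the pushes done in save_args() above.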
1255 
1256 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1257     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1258       if (args[i].first()->is_Register()) {
1259         __ pop(args[i].first()->as_Register());
1260       } else if (args[i].first()->is_XMMRegister()) {
1261         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1262         __ addptr(rsp, 2*wordSize);
1263       }
1264     }
1265 }
1266 
1267 static void verify_oop_args(MacroAssembler* masm,
1268                             const methodHandle& method,
1269                             const BasicType* sig_bt,
1270                             const VMRegPair* regs) {
1271   Register temp_reg = rbx;  // not part of any compiled calling seq
1272   if (VerifyOops) {
1273     for (int i = 0; i < method->size_of_parameters(); i++) {
1274       if (is_reference_type(sig_bt[i])) {
1275         VMReg r = regs[i].first();
1276         assert(r->is_valid(), "bad oop arg");
1277         if (r->is_stack()) {
1278           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1279           __ verify_oop(temp_reg);
1280         } else {
1281           __ verify_oop(r->as_Register());
1282         }
1283       }
1284     }
1285   }
1286 }
1287 
1288 static void check_continuation_enter_argument(VMReg actual_vmreg,
1289                                               Register expected_reg,
1290                                               const char* name) {
1291   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1292   assert(actual_vmreg->as_Register() == expected_reg,
1293          "%s is in unexpected register: %s instead of %s",
1294          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1295 }
1296 
1297 
1298 //---------------------------- continuation_enter_setup ---------------------------
1299 //
1300 // Arguments:
1301 //   None.
1302 //
1303 // Results:
1304 //   rsp: pointer to blank ContinuationEntry
1305 //
1306 // Kills:
1307 //   rax
1308 //
1309 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1310   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1311   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1312   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1313 
1314   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1315   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1316 
1317   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1318   OopMap* map = new OopMap(frame_size, 0);
1319 
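  // Link the new entry into the thread's chain:
  //   entry->parent      = thread->cont_entry
  //   thread->cont_entry = rsp (the new entry)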
1320   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1321   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1322   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1323 
1324   return map;
1325 }
1326 
1327 //---------------------------- fill_continuation_entry ---------------------------
1328 //
1329 // Arguments:
1330 //   rsp: pointer to blank Continuation entry
1331 //   reg_cont_obj: pointer to the continuation
1332 //   reg_flags: flags
1333 //
1334 // Results:
1335 //   rsp: pointer to filled out ContinuationEntry
1336 //
1337 // Kills:
1338 //   rax
1339 //
1340 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1341   assert_different_registers(rax, reg_cont_obj, reg_flags);
1342 #ifdef ASSERT
1343   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1344 #endif
1345   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1346   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1347   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1348   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1349   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1350 
1351   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1352   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1353 
1354   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1355 }
1356 
1357 //---------------------------- continuation_enter_cleanup ---------------------------
1358 //
1359 // Arguments:
1360 //   rsp: pointer to the ContinuationEntry
1361 //
1362 // Results:
1363 //   rsp: pointer to the spilled rbp in the entry frame
1364 //
1365 // Kills:
1366 //   rbx
1367 //
1368 static void continuation_enter_cleanup(MacroAssembler* masm) {
1369 #ifdef ASSERT
1370   Label L_good_sp;
1371   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1372   __ jcc(Assembler::equal, L_good_sp);
1373   __ stop("Incorrect rsp at continuation_enter_cleanup");
1374   __ bind(L_good_sp);
1375 #endif
1376   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1377   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1378   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1379   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1380   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1381 }
1382 
1383 static void gen_continuation_enter(MacroAssembler* masm,
1384                                    const VMRegPair* regs,
1385                                    int& exception_offset,
1386                                    OopMapSet* oop_maps,
1387                                    int& frame_complete,
1388                                    int& stack_slots,
1389                                    int& interpreted_entry_offset,
1390                                    int& compiled_entry_offset) {
1391 
1392   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1393   int pos_cont_obj   = 0;
1394   int pos_is_cont    = 1;
1395   int pos_is_virtual = 2;
1396 
1397   // The platform-specific calling convention may present the arguments in various registers.
1398   // To simplify the rest of the code, we expect the arguments to reside in these known
1399   // registers, and we additionally check the placement here in case the calling convention
1400   // ever changes.
1401   Register reg_cont_obj   = c_rarg1;
1402   Register reg_is_cont    = c_rarg2;
1403   Register reg_is_virtual = c_rarg3;
1404 
1405   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1406   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1407   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1408 
1409   // Utility methods kill rax, make sure there are no collisions
1410   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1411 
1412   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1413                          relocInfo::static_call_type);
1414 
1415   address start = __ pc();
1416 
1417   Label L_thaw, L_exit;
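  // Both the interpreted and the compiled entry below set up a ContinuationEntry and then
  // either call the thaw stub (isContinue != 0) or make a resolvable static call to
  // Continuation.enter(); both paths rejoin at L_exit.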
1418 
1419   // i2i entry used at interp_only_mode only
1420   interpreted_entry_offset = __ pc() - start;
1421   {
1422 #ifdef ASSERT
1423     Label is_interp_only;
1424     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1425     __ jcc(Assembler::notEqual, is_interp_only);
1426     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1427     __ bind(is_interp_only);
1428 #endif
1429 
1430     __ pop(rax); // return address
1431     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1432     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1433     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1434     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1435     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1436     __ push(rax); // return address
1437     __ push_cont_fastpath();
1438 
1439     __ enter();
1440 
1441     stack_slots = 2; // will be adjusted in setup
1442     OopMap* map = continuation_enter_setup(masm, stack_slots);
1443     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear
1444     // unsafe. That's okay: at the very worst we miss an async sample, and we're in interp_only_mode anyway.
1445 
1446     __ verify_oop(reg_cont_obj);
1447 
1448     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1449 
1450     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1451     __ testptr(reg_is_cont, reg_is_cont);
1452     __ jcc(Assembler::notZero, L_thaw);
1453 
1454     // --- Resolve path
1455 
1456     // Make sure the call is patchable
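    // (aligning so that the call's 4-byte displacement does not cross a word boundary
    //  and can be patched atomically)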
1457     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1458     // Emit stub for static call
1459     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1460     if (stub == nullptr) {
1461       fatal("CodeCache is full at gen_continuation_enter");
1462     }
1463     __ call(resolve);
1464     oop_maps->add_gc_map(__ pc() - start, map);
1465     __ post_call_nop();
1466 
1467     __ jmp(L_exit);
1468   }
1469 
1470   // compiled entry
1471   __ align(CodeEntryAlignment);
1472   compiled_entry_offset = __ pc() - start;
1473   __ enter();
1474 
1475   stack_slots = 2; // will be adjusted in setup
1476   OopMap* map = continuation_enter_setup(masm, stack_slots);
1477 
1478   // Frame is now completed as far as size and linkage.
1479   frame_complete = __ pc() - start;
1480 
1481   __ verify_oop(reg_cont_obj);
1482 
1483   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1484 
1485   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1486   __ testptr(reg_is_cont, reg_is_cont);
1487   __ jccb(Assembler::notZero, L_thaw);
1488 
1489   // --- call Continuation.enter(Continuation c, boolean isContinue)
1490 
1491   // Make sure the call is patchable
1492   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1493 
1494   // Emit stub for static call
1495   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1496   if (stub == nullptr) {
1497     fatal("CodeCache is full at gen_continuation_enter");
1498   }
1499 
1500   // The call needs to be resolved. There's a special case for this in
1501   // SharedRuntime::find_callee_info_helper() which calls
1502   // LinkResolver::resolve_continuation_enter() which resolves the call to
1503   // Continuation.enter(Continuation c, boolean isContinue).
1504   __ call(resolve);
1505 
1506   oop_maps->add_gc_map(__ pc() - start, map);
1507   __ post_call_nop();
1508 
1509   __ jmpb(L_exit);
1510 
1511   // --- Thawing path
1512 
1513   __ bind(L_thaw);
1514 
1515   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1516   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1517 
1518   ContinuationEntry::_return_pc_offset = __ pc() - start;
1519   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1520   __ post_call_nop();
1521 
1522   // --- Normal exit (resolve/thawing)
1523 
1524   __ bind(L_exit);
1525   ContinuationEntry::_cleanup_offset = __ pc() - start;
1526   continuation_enter_cleanup(masm);
1527   __ pop(rbp);
1528   __ ret(0);
1529 
1530   // --- Exception handling path
1531 
1532   exception_offset = __ pc() - start;
1533 
1534   continuation_enter_cleanup(masm);
1535   __ pop(rbp);
1536 
1537   __ movptr(c_rarg0, r15_thread);
1538   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1539 
1540   // rax still holds the original exception oop, save it before the call
1541   __ push(rax);
1542 
1543   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1544   __ movptr(rbx, rax);
1545 
1546   // Continue at exception handler:
1547   //   rax: exception oop
1548   //   rbx: exception handler
1549   //   rdx: exception pc
1550   __ pop(rax);
1551   __ verify_oop(rax);
1552   __ pop(rdx);
1553   __ jmp(rbx);
1554 }
1555 
1556 static void gen_continuation_yield(MacroAssembler* masm,
1557                                    const VMRegPair* regs,
1558                                    OopMapSet* oop_maps,
1559                                    int& frame_complete,
1560                                    int& stack_slots,
1561                                    int& compiled_entry_offset) {
1562   enum layout {
1563     rbp_off,
1564     rbpH_off,
1565     return_off,
1566     return_off2,
1567     framesize // inclusive of return address
1568   };
1569   stack_slots = framesize /  VMRegImpl::slots_per_word;
1570   assert(stack_slots == 2, "recheck layout");
1571 
1572   address start = __ pc();
1573   compiled_entry_offset = __ pc() - start;
1574   __ enter();
1575   address the_pc = __ pc();
1576 
1577   frame_complete = the_pc - start;
1578 
1579   // This nop must be exactly at the PC we push into the frame info.
1580   // We use this nop for fast CodeBlob lookup, associate the OopMap
1581   // with it right away.
1582   __ post_call_nop();
1583   OopMap* map = new OopMap(framesize, 1);
1584   oop_maps->add_gc_map(frame_complete, map);
1585 
1586   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1587   __ movptr(c_rarg0, r15_thread);
1588   __ movptr(c_rarg1, rsp);
1589   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1590   __ reset_last_Java_frame(true);
1591 
1592   Label L_pinned;
1593 
1594   __ testptr(rax, rax);
1595   __ jcc(Assembler::notZero, L_pinned);
1596 
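  // Freeze succeeded: reset rsp to the ContinuationEntry, tear the entry down and
  // return to the caller of enterSpecial.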
1597   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1598   continuation_enter_cleanup(masm);
1599   __ pop(rbp);
1600   __ ret(0);
1601 
1602   __ bind(L_pinned);
1603 
1604   // Pinned, return to caller
1605 
1606   // handle pending exception thrown by freeze
1607   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1608   Label ok;
1609   __ jcc(Assembler::equal, ok);
1610   __ leave();
1611   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1612   __ bind(ok);
1613 
1614   __ leave();
1615   __ ret(0);
1616 }
1617 
1618 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1619   ::continuation_enter_cleanup(masm);
1620 }
1621 
1622 static void gen_special_dispatch(MacroAssembler* masm,
1623                                  const methodHandle& method,
1624                                  const BasicType* sig_bt,
1625                                  const VMRegPair* regs) {
1626   verify_oop_args(masm, method, sig_bt, regs);
1627   vmIntrinsics::ID iid = method->intrinsic_id();
1628 
1629   // Now write the args into the outgoing interpreter space
1630   bool     has_receiver   = false;
1631   Register receiver_reg   = noreg;
1632   int      member_arg_pos = -1;
1633   Register member_reg     = noreg;
1634   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1635   if (ref_kind != 0) {
1636     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1637     member_reg = rbx;  // known to be free at this point
1638     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1639   } else if (iid == vmIntrinsics::_invokeBasic) {
1640     has_receiver = true;
1641   } else if (iid == vmIntrinsics::_linkToNative) {
1642     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1643     member_reg = rbx;  // known to be free at this point
1644   } else {
1645     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1646   }
1647 
1648   if (member_reg != noreg) {
1649     // Load the member_arg into register, if necessary.
1650     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1651     VMReg r = regs[member_arg_pos].first();
1652     if (r->is_stack()) {
1653       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1654     } else {
1655       // no data motion is needed
1656       member_reg = r->as_Register();
1657     }
1658   }
1659 
1660   if (has_receiver) {
1661     // Make sure the receiver is loaded into a register.
1662     assert(method->size_of_parameters() > 0, "oob");
1663     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1664     VMReg r = regs[0].first();
1665     assert(r->is_valid(), "bad receiver arg");
1666     if (r->is_stack()) {
1667       // Porting note:  This assumes that compiled calling conventions always
1668       // pass the receiver oop in a register.  If this is not true on some
1669       // platform, pick a temp and load the receiver from stack.
1670       fatal("receiver always in a register");
1671       receiver_reg = j_rarg0;  // known to be free at this point
1672       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1673     } else {
1674       // no data motion is needed
1675       receiver_reg = r->as_Register();
1676     }
1677   }
1678 
1679   // Figure out which address we are really jumping to:
1680   MethodHandles::generate_method_handle_dispatch(masm, iid,
1681                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1682 }
1683 
1684 // ---------------------------------------------------------------------------
1685 // Generate a native wrapper for a given method.  The method takes arguments
1686 // in the Java compiled code convention, marshals them to the native
1687 // convention (handlizes oops, etc), transitions to native, makes the call,
1688 // returns to java state (possibly blocking), unhandlizes any result and
1689 // returns.
1690 //
1691 // Critical native functions are a shorthand for the use of
1692 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1693 // functions.  The wrapper is expected to unpack the arguments before
1694 // passing them to the callee. Critical native functions leave the state _in_Java,
1695 // since they cannot stop for GC.
1696 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1697 // block and the check for pending exceptions, since it's impossible for them
1698 // to be thrown.
1699 //
1700 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1701                                                 const methodHandle& method,
1702                                                 int compile_id,
1703                                                 BasicType* in_sig_bt,
1704                                                 VMRegPair* in_regs,
1705                                                 BasicType ret_type) {
1706   if (method->is_continuation_native_intrinsic()) {
1707     int exception_offset = -1;
1708     OopMapSet* oop_maps = new OopMapSet();
1709     int frame_complete = -1;
1710     int stack_slots = -1;
1711     int interpreted_entry_offset = -1;
1712     int vep_offset = -1;
1713     if (method->is_continuation_enter_intrinsic()) {
1714       gen_continuation_enter(masm,
1715                              in_regs,
1716                              exception_offset,
1717                              oop_maps,
1718                              frame_complete,
1719                              stack_slots,
1720                              interpreted_entry_offset,
1721                              vep_offset);
1722     } else if (method->is_continuation_yield_intrinsic()) {
1723       gen_continuation_yield(masm,
1724                              in_regs,
1725                              oop_maps,
1726                              frame_complete,
1727                              stack_slots,
1728                              vep_offset);
1729     } else {
1730       guarantee(false, "Unknown Continuation native intrinsic");
1731     }
1732 
1733 #ifdef ASSERT
1734     if (method->is_continuation_enter_intrinsic()) {
1735       assert(interpreted_entry_offset != -1, "Must be set");
1736       assert(exception_offset != -1,         "Must be set");
1737     } else {
1738       assert(interpreted_entry_offset == -1, "Must be unset");
1739       assert(exception_offset == -1,         "Must be unset");
1740     }
1741     assert(frame_complete != -1,    "Must be set");
1742     assert(stack_slots != -1,       "Must be set");
1743     assert(vep_offset != -1,        "Must be set");
1744 #endif
1745 
1746     __ flush();
1747     nmethod* nm = nmethod::new_native_nmethod(method,
1748                                               compile_id,
1749                                               masm->code(),
1750                                               vep_offset,
1751                                               frame_complete,
1752                                               stack_slots,
1753                                               in_ByteSize(-1),
1754                                               in_ByteSize(-1),
1755                                               oop_maps,
1756                                               exception_offset);
1757     if (nm == nullptr) return nm;
1758     if (method->is_continuation_enter_intrinsic()) {
1759       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1760     } else if (method->is_continuation_yield_intrinsic()) {
1761       _cont_doYield_stub = nm;
1762     }
1763     return nm;
1764   }
1765 
1766   if (method->is_method_handle_intrinsic()) {
1767     vmIntrinsics::ID iid = method->intrinsic_id();
1768     intptr_t start = (intptr_t)__ pc();
1769     int vep_offset = ((intptr_t)__ pc()) - start;
1770     gen_special_dispatch(masm,
1771                          method,
1772                          in_sig_bt,
1773                          in_regs);
1774     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1775     __ flush();
1776     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1777     return nmethod::new_native_nmethod(method,
1778                                        compile_id,
1779                                        masm->code(),
1780                                        vep_offset,
1781                                        frame_complete,
1782                                        stack_slots / VMRegImpl::slots_per_word,
1783                                        in_ByteSize(-1),
1784                                        in_ByteSize(-1),
1785                                        nullptr);
1786   }
1787   address native_func = method->native_function();
1788   assert(native_func != nullptr, "must have function");
1789 
1790   // An OopMap for lock (and class if static)
1791   OopMapSet *oop_maps = new OopMapSet();
1792   intptr_t start = (intptr_t)__ pc();
1793 
1794   // We have received a description of where all the java args are located
1795   // on entry to the wrapper. We need to convert these args to where
1796   // the jni function will expect them. To figure out where they go
1797   // we convert the java signature to a C signature by inserting
1798   // the hidden arguments as arg[0] and possibly arg[1] (static method).
1799 
1800   const int total_in_args = method->size_of_parameters();
1801   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1802 
1803   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1804   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1805 
1806   int argc = 0;
1807   out_sig_bt[argc++] = T_ADDRESS;
1808   if (method->is_static()) {
1809     out_sig_bt[argc++] = T_OBJECT;
1810   }
1811 
1812   for (int i = 0; i < total_in_args ; i++ ) {
1813     out_sig_bt[argc++] = in_sig_bt[i];
1814   }
1815 
1816   // Now figure out where the args must be stored and how much stack space
1817   // they require.
1818   int out_arg_slots;
1819   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1820 
1821   // Compute framesize for the wrapper.  We need to handlize all oops in
1822   // incoming registers
1823 
1824   // Calculate the total number of stack slots we will need.
1825 
1826   // First count the abi requirement plus all of the outgoing args
1827   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1828 
1829   // Now the space for the inbound oop handle area
1830   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1831 
1832   int oop_handle_offset = stack_slots;
1833   stack_slots += total_save_slots;
1834 
1835   // Now any space we need for handlizing a klass if static method
1836 
1837   int klass_slot_offset = 0;
1838   int klass_offset = -1;
1839   int lock_slot_offset = 0;
1840   bool is_static = false;
1841 
1842   if (method->is_static()) {
1843     klass_slot_offset = stack_slots;
1844     stack_slots += VMRegImpl::slots_per_word;
1845     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1846     is_static = true;
1847   }
1848 
1849   // Plus a lock if needed
1850 
1851   if (method->is_synchronized()) {
1852     lock_slot_offset = stack_slots;
1853     stack_slots += VMRegImpl::slots_per_word;
1854   }
1855 
1856   // Now a place (+2) to save return values or temp during shuffling
1857   // + 4 for return address (which we own) and saved rbp
1858   stack_slots += 6;
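  // (6 VMReg slots = 3 words: one word of scratch for shuffling, plus return address
  //  and saved rbp)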
1859 
1860   // Ok The space we have allocated will look like:
1861   //
1862   //
1863   // FP-> |                     |
1864   //      |---------------------|
1865   //      | 2 slots for moves   |
1866   //      |---------------------|
1867   //      | lock box (if sync)  |
1868   //      |---------------------| <- lock_slot_offset
1869   //      | klass (if static)   |
1870   //      |---------------------| <- klass_slot_offset
1871   //      | oopHandle area      |
1872   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1873   //      | outbound memory     |
1874   //      | based arguments     |
1875   //      |                     |
1876   //      |---------------------|
1877   //      |                     |
1878   // SP-> | out_preserved_slots |
1879   //
1880   //
1881 
1882 
1883   // Now compute the actual number of stack words we need, rounding to keep
1884   // the stack properly aligned.
1885   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1886 
1887   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1888 
1889   // First thing make an ic check to see if we should even be here
1890 
1891   // We are free to use all registers as temps without saving them and
1892   // restoring them except rbp. rbp is the only callee save register
1893   // as far as the interpreter and the compiler(s) are concerned.
1894 
1895   const Register receiver = j_rarg0;
1896 
1897   Label exception_pending;
1898 
1899   assert_different_registers(receiver, rscratch1, rscratch2);
1900   __ verify_oop(receiver);
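  // Inline cache check; a klass mismatch dispatches to the IC miss stub.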
1901   __ ic_check(8 /* end_alignment */);
1902 
1903   int vep_offset = ((intptr_t)__ pc()) - start;
1904 
1905   if (method->needs_clinit_barrier()) {
1906     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
1907     Label L_skip_barrier;
1908     Register klass = r10;
1909     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1910     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1911 
1912     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1913 
1914     __ bind(L_skip_barrier);
1915   }
1916 
1917 #ifdef COMPILER1
1918   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1919   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1920     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1921   }
1922 #endif // COMPILER1
1923 
1924   // The instruction at the verified entry point must be 5 bytes or longer
1925   // because it can be patched on the fly by make_non_entrant. The stack bang
1926   // instruction fits that requirement.
1927 
1928   // Generate stack overflow check
1929   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1930 
1931   // Generate a new frame for the wrapper.
1932   __ enter();
1933   // -2 because return address is already present and so is saved rbp
1934   __ subptr(rsp, stack_size - 2*wordSize);
1935 
1936   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1937   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1938   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1939 
1940   // Frame is now completed as far as size and linkage.
1941   int frame_complete = ((intptr_t)__ pc()) - start;
1942 
1943 #ifdef ASSERT
1944   __ check_stack_alignment(rsp, "improperly aligned stack");
1945 #endif /* ASSERT */
1946 
1947 
1948   // We use r14 as the oop handle for the receiver/klass
1949   // It is callee save so it survives the call to native
1950 
1951   const Register oop_handle_reg = r14;
1952 
1953   //
1954   // We immediately shuffle the arguments so that any vm call we have to
1955   // make from here on out (sync slow path, jvmti, etc.) we will have
1956   // captured the oops from our caller and have a valid oopMap for
1957   // them.
1958 
1959   // -----------------
1960   // The Grand Shuffle
1961 
1962   // The Java calling convention is either equal (linux) or denser (win64) than the
1963   // C calling convention. However, because of the jni_env argument, the C calling
1964   // convention always has at least one more argument (and two for static methods) than Java.
1965   // Therefore, if we move the args from java -> c backwards, we will never have
1966   // a register->register conflict and we don't have to build a dependency graph
1967   // and figure out how to break any cycles.
1968   //
1969 
1970   // Record esp-based slot for receiver on stack for non-static methods
1971   int receiver_offset = -1;
1972 
1973   // This is a trick. We double the stack slots so we can claim
1974   // the oops in the caller's frame. Since we are sure to have
1975   // more args than the caller doubling is enough to make
1976   // sure we can capture all the incoming oop args from the
1977   // caller.
1978   //
1979   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1980 
1981   // Mark location of rbp (someday)
1982   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1983 
1984   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1985   // All inbound args are referenced based on rbp and all outbound args via rsp.
1986 
1987 
1988 #ifdef ASSERT
1989   bool reg_destroyed[Register::number_of_registers];
1990   bool freg_destroyed[XMMRegister::number_of_registers];
1991   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
1992     reg_destroyed[r] = false;
1993   }
1994   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
1995     freg_destroyed[f] = false;
1996   }
1997 
1998 #endif /* ASSERT */
1999 
2000   // For JNI natives the incoming and outgoing registers are offset upwards.
2001   GrowableArray<int> arg_order(2 * total_in_args);
2002 
2003   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2004     arg_order.push(i);
2005     arg_order.push(c_arg);
2006   }
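  // arg_order now holds (java index, C index) pairs from the last argument down to the
  // first, so the loop below shuffles the arguments back to front as described above.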
2007 
2008   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2009     int i = arg_order.at(ai);
2010     int c_arg = arg_order.at(ai + 1);
2011     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2012 #ifdef ASSERT
2013     if (in_regs[i].first()->is_Register()) {
2014       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2015     } else if (in_regs[i].first()->is_XMMRegister()) {
2016       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2017     }
2018     if (out_regs[c_arg].first()->is_Register()) {
2019       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2020     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2021       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2022     }
2023 #endif /* ASSERT */
2024     switch (in_sig_bt[i]) {
2025       case T_ARRAY:
2026       case T_OBJECT:
2027         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2028                     ((i == 0) && (!is_static)),
2029                     &receiver_offset);
2030         break;
2031       case T_VOID:
2032         break;
2033 
2034       case T_FLOAT:
2035         __ float_move(in_regs[i], out_regs[c_arg]);
2036         break;
2037 
2038       case T_DOUBLE:
2039         assert( i + 1 < total_in_args &&
2040                 in_sig_bt[i + 1] == T_VOID &&
2041                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2042         __ double_move(in_regs[i], out_regs[c_arg]);
2043         break;
2044 
2045       case T_LONG :
2046         __ long_move(in_regs[i], out_regs[c_arg]);
2047         break;
2048 
2049       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2050 
2051       default:
2052         __ move32_64(in_regs[i], out_regs[c_arg]);
2053     }
2054   }
2055 
2056   int c_arg;
2057 
2058   // Pre-load a static method's oop into r14.  Used both by locking code and
2059   // the normal JNI call code.
2060   // point c_arg at the first arg that is already loaded in case we
2061   // need to spill before we call out
2062   c_arg = total_c_args - total_in_args;
2063 
2064   if (method->is_static()) {
2065 
2066     //  load oop into a register
2067     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2068 
2069     // Now handlize the static class mirror; it's known to be non-null.
2070     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2071     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2072 
2073     // Now get the handle
2074     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2075     // store the klass handle as second argument
2076     __ movptr(c_rarg1, oop_handle_reg);
2077     // and protect the arg if we must spill
2078     c_arg--;
2079   }
2080 
2081   // Change state to native (we save the return address in the thread, since it might not
2082   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2083   // points into the right code segment. It does not have to be the correct return pc.
2084   // We use the same pc/oopMap repeatedly when we call out
2085 
2086   Label native_return;
2087   if (method->is_object_wait0()) {
2088     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2089     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2090   } else {
2091     intptr_t the_pc = (intptr_t) __ pc();
2092     oop_maps->add_gc_map(the_pc - start, map);
2093 
2094     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2095   }
2096 
2097   // We have all of the arguments set up at this point. We must not touch any of the
2098   // argument registers at this point (what if we save/restore them and there are no oops?).
2099 
2100   if (DTraceMethodProbes) {
2101     // protect the args we've loaded
2102     save_args(masm, total_c_args, c_arg, out_regs);
2103     __ mov_metadata(c_rarg1, method());
2104     __ call_VM_leaf(
2105       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2106       r15_thread, c_rarg1);
2107     restore_args(masm, total_c_args, c_arg, out_regs);
2108   }
2109 
2110   // RedefineClasses() tracing support for obsolete method entry
2111   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2112     // protect the args we've loaded
2113     save_args(masm, total_c_args, c_arg, out_regs);
2114     __ mov_metadata(c_rarg1, method());
2115     __ call_VM_leaf(
2116       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2117       r15_thread, c_rarg1);
2118     restore_args(masm, total_c_args, c_arg, out_regs);
2119   }
2120 
2121   // Lock a synchronized method
2122 
2123   // Register definitions used by locking and unlocking
2124 
2125   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2126   const Register obj_reg  = rbx;  // Will contain the oop
2127   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2128 
2129   Label slow_path_lock;
2130   Label lock_done;
2131 
2132   if (method->is_synchronized()) {
2133     // Get the handle (the 2nd argument)
2134     __ mov(oop_handle_reg, c_rarg1);
2135 
2136     // Get address of the box
2137 
2138     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2139 
2140     // Load the oop from the handle
2141     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2142 
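    // Inlined fast-path lock; on failure control transfers to slow_path_lock below.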
2143     __ fast_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2144 
2145     // Slow path will re-enter here
2146     __ bind(lock_done);
2147   }
2148 
2149   // Finally just about ready to make the JNI call
2150 
2151   // get JNIEnv* which is first argument to native
2152   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2153 
2154   // Now set thread in native
2155   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2156 
2157   __ call(RuntimeAddress(native_func));
2158 
2159   // Verify or restore cpu control state after JNI call
2160   __ restore_cpu_control_state_after_jni(rscratch1);
2161 
2162   // Unpack native results.
2163   switch (ret_type) {
2164   case T_BOOLEAN: __ c2bool(rax);            break;
2165   case T_CHAR   : __ movzwl(rax, rax);      break;
2166   case T_BYTE   : __ sign_extend_byte (rax); break;
2167   case T_SHORT  : __ sign_extend_short(rax); break;
2168   case T_INT    : /* nothing to do */        break;
2169   case T_DOUBLE :
2170   case T_FLOAT  :
2171     // Result is in xmm0 we'll save as needed
2172     break;
2173   case T_ARRAY:                 // Really a handle
2174   case T_OBJECT:                // Really a handle
2175       break; // can't de-handlize until after safepoint check
2176   case T_VOID: break;
2177   case T_LONG: break;
2178   default       : ShouldNotReachHere();
2179   }
2180 
2181   // Switch thread to "native transition" state before reading the synchronization state.
2182   // This additional state is necessary because reading and testing the synchronization
2183   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2184   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2185   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2186   //     Thread A is resumed to finish this native method, but doesn't block here since it
2187   //     didn't see any synchronization in progress, and escapes.
2188   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2189 
2190   // Force this write out before the read below
2191   if (!UseSystemMemoryBarrier) {
2192     __ membar(Assembler::Membar_mask_bits(
2193               Assembler::LoadLoad | Assembler::LoadStore |
2194               Assembler::StoreLoad | Assembler::StoreStore));
2195   }
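  // (This acts as a StoreLoad barrier: the thread-state store above must be visible before
  //  the safepoint/suspend state is read below. With UseSystemMemoryBarrier the VM issues a
  //  system-wide barrier instead, so the local fence can be skipped.)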
2196 
2197   // check for safepoint operation in progress and/or pending suspend requests
2198   {
2199     Label Continue;
2200     Label slow_path;
2201 
2202     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2203 
2204     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2205     __ jcc(Assembler::equal, Continue);
2206     __ bind(slow_path);
2207 
2208     // Don't use call_VM as it will see a possible pending exception and forward it
2209     // and never return here preventing us from clearing _last_native_pc down below.
2210     // We can't use call_VM_leaf either, as it will check to see if rsi & rdi are
2211     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2212     // by hand.
2213     //
2214     __ vzeroupper();
2215     save_native_result(masm, ret_type, stack_slots);
2216     __ mov(c_rarg0, r15_thread);
2217     __ mov(r12, rsp); // remember sp
2218     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2219     __ andptr(rsp, -16); // align stack as required by ABI
2220     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2221     __ mov(rsp, r12); // restore sp
2222     __ reinit_heapbase();
2223     // Restore any method result value
2224     restore_native_result(masm, ret_type, stack_slots);
2225     __ bind(Continue);
2226   }
2227 
2228   // change thread state
2229   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2230 
2231   if (method->is_object_wait0()) {
2232     // Check preemption for Object.wait()
2233     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2234     __ cmpptr(rscratch1, NULL_WORD);
2235     __ jccb(Assembler::equal, native_return);
2236     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2237     __ jmp(rscratch1);
2238     __ bind(native_return);
2239 
2240     intptr_t the_pc = (intptr_t) __ pc();
2241     oop_maps->add_gc_map(the_pc - start, map);
2242   }
2243 
2244 
2245   Label reguard;
2246   Label reguard_done;
2247   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2248   __ jcc(Assembler::equal, reguard);
2249   __ bind(reguard_done);
2250 
2251   // native result if any is live
2252 
2253   // Unlock
2254   Label slow_path_unlock;
2255   Label unlock_done;
2256   if (method->is_synchronized()) {
2257 
2258     Label fast_done;
2259 
2260     // Get locked oop from the handle we passed to jni
2261     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2262 
2263     // Must save rax if it is live now because cmpxchg must use it
2264     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2265       save_native_result(masm, ret_type, stack_slots);
2266     }
2267 
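    // Inlined fast-path unlock; on failure control transfers to slow_path_unlock below.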
2268     __ fast_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2269 
2270     // slow path re-enters here
2271     __ bind(unlock_done);
2272     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2273       restore_native_result(masm, ret_type, stack_slots);
2274     }
2275 
2276     __ bind(fast_done);
2277   }
2278   if (DTraceMethodProbes) {
2279     save_native_result(masm, ret_type, stack_slots);
2280     __ mov_metadata(c_rarg1, method());
2281     __ call_VM_leaf(
2282          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2283          r15_thread, c_rarg1);
2284     restore_native_result(masm, ret_type, stack_slots);
2285   }
2286 
2287   __ reset_last_Java_frame(false);
2288 
2289   // Unbox oop result, e.g. JNIHandles::resolve value.
2290   if (is_reference_type(ret_type)) {
2291     __ resolve_jobject(rax /* value */,
2292                        rcx /* tmp */);
2293   }
2294 
2295   if (CheckJNICalls) {
2296     // clear_pending_jni_exception_check
2297     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2298   }
2299 
2300   // reset handle block
2301   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2302   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
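  // Resetting the block's top logically drops all JNI local references created during this
  // call; the block is only scanned up to its top, so their referents are no longer kept alive.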
2303 
2304   // pop our frame
2305 
2306   __ leave();
2307 
2308 #if INCLUDE_JFR
2309   // We need to do a poll test after unwind in case the sampler
2310   // managed to sample the native frame after returning to Java.
2311   Label L_return;
2312   address poll_test_pc = __ pc();
2313   __ relocate(relocInfo::poll_return_type);
2314   __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2315   __ jccb(Assembler::zero, L_return);
2316   __ lea(rscratch1, InternalAddress(poll_test_pc));
2317   __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2318   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2319     "polling page return stub not created yet");
2320   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2321   __ jump(RuntimeAddress(stub));
2322   __ bind(L_return);
2323 #endif // INCLUDE_JFR
2324 
2325   // Any exception pending?
2326   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2327   __ jcc(Assembler::notEqual, exception_pending);
2328 
2329   // Return
2330 
2331   __ ret(0);
2332 
2333   // Unexpected paths are out of line and go here
2334 
2335   // forward the exception
2336   __ bind(exception_pending);
2337 
2338   // and forward the exception
2339   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2340 
2341   // Slow path locking & unlocking
2342   if (method->is_synchronized()) {
2343 
2344     // BEGIN Slow path lock
2345     __ bind(slow_path_lock);
2346 
2347     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2348     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2349 
2350     // protect the args we've loaded
2351     save_args(masm, total_c_args, c_arg, out_regs);
2352 
2353     __ mov(c_rarg0, obj_reg);
2354     __ mov(c_rarg1, lock_reg);
2355     __ mov(c_rarg2, r15_thread);
2356 
2357     // Not a leaf but we have last_Java_frame setup as we want.
2358     // We don't want to unmount in case of contention since that would complicate preserving
2359     // the arguments that had already been marshalled into the native convention. So we force
2360     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2361     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2362     __ push_cont_fastpath();
2363     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2364     __ pop_cont_fastpath();
2365     restore_args(masm, total_c_args, c_arg, out_regs);
2366 
2367 #ifdef ASSERT
2368     { Label L;
2369     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2370     __ jcc(Assembler::equal, L);
2371     __ stop("no pending exception allowed on exit from monitorenter");
2372     __ bind(L);
2373     }
2374 #endif
2375     __ jmp(lock_done);
2376 
2377     // END Slow path lock
2378 
2379     // BEGIN Slow path unlock
2380     __ bind(slow_path_unlock);
2381 
2382     // If we haven't already saved the native result we must save it now as xmm registers
2383     // are still exposed.
2384     __ vzeroupper();
2385     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2386       save_native_result(masm, ret_type, stack_slots);
2387     }
2388 
2389     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2390 
2391     __ mov(c_rarg0, obj_reg);
2392     __ mov(c_rarg2, r15_thread);
2393     __ mov(r12, rsp); // remember sp
2394     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2395     __ andptr(rsp, -16); // align stack as required by ABI
2396 
2397     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2398     // NOTE that obj_reg == rbx currently
2399     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2400     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2401 
2402     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2403     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2404     __ mov(rsp, r12); // restore sp
2405     __ reinit_heapbase();
2406 #ifdef ASSERT
2407     {
2408       Label L;
2409       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2410       __ jcc(Assembler::equal, L);
2411       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2412       __ bind(L);
2413     }
2414 #endif /* ASSERT */
2415 
2416     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2417 
2418     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2419       restore_native_result(masm, ret_type, stack_slots);
2420     }
2421     __ jmp(unlock_done);
2422 
2423     // END Slow path unlock
2424 
2425   } // synchronized
2426 
2427   // SLOW PATH Reguard the stack if needed
2428 
2429   __ bind(reguard);
2430   __ vzeroupper();
2431   save_native_result(masm, ret_type, stack_slots);
2432   __ mov(r12, rsp); // remember sp
2433   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2434   __ andptr(rsp, -16); // align stack as required by ABI
2435   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2436   __ mov(rsp, r12); // restore sp
2437   __ reinit_heapbase();
2438   restore_native_result(masm, ret_type, stack_slots);
2439   // and continue
2440   __ jmp(reguard_done);
2441 
2442 
2443 
2444   __ flush();
2445 
2446   nmethod *nm = nmethod::new_native_nmethod(method,
2447                                             compile_id,
2448                                             masm->code(),
2449                                             vep_offset,
2450                                             frame_complete,
2451                                             stack_slots / VMRegImpl::slots_per_word,
2452                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2453                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2454                                             oop_maps);
2455 
2456   return nm;
2457 }
2458 
2459 // This function returns the adjustment size (in number of words) to a c2i adapter
2460 // activation for use during deoptimization.
2461 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2462   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2463 }
2464 
2465 
2466 uint SharedRuntime::out_preserve_stack_slots() {
2467   return 0;
2468 }
2469 
2470 
2471 // Number of stack slots between incoming argument block and the start of
2472 // a new frame.  The PROLOG must add this many slots to the stack.  The
2473 // EPILOG must remove this many slots.  amd64 needs two slots for
2474 // return address.
2475 uint SharedRuntime::in_preserve_stack_slots() {
2476   return 4 + 2 * VerifyStackAtCalls;
2477 }
2478 
2479 VMReg SharedRuntime::thread_register() {
2480   return r15_thread->as_VMReg();
2481 }
2482 
2483 //------------------------------generate_deopt_blob----------------------------
2484 void SharedRuntime::generate_deopt_blob() {
2485   // Allocate space for the code
2486   ResourceMark rm;
2487   // Setup code generation tools
2488   int pad = 0;
2489   if (UseAVX > 2) {
2490     pad += 1024;
2491   }
2492   if (UseAPX) {
2493     pad += 1024;
2494   }
2495 #if INCLUDE_JVMCI
2496   if (EnableJVMCI) {
2497     pad += 512; // Increase the buffer size when compiling for JVMCI
2498   }
2499 #endif
2500   const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
2501   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2502   if (blob != nullptr) {
2503     _deopt_blob = blob->as_deoptimization_blob();
2504     return;
2505   }
2506 
2507   CodeBuffer buffer(name, 2560+pad, 1024);
2508   MacroAssembler* masm = new MacroAssembler(&buffer);
2509   int frame_size_in_words;
2510   OopMap* map = nullptr;
2511   OopMapSet *oop_maps = new OopMapSet();
2512 
2513   // -------------
2514   // This code enters when returning to a de-optimized nmethod.  A return
2515   // address has been pushed on the stack, and return values are in
2516   // registers.
2517   // If we are doing a normal deopt then we were called from the patched
2518   // nmethod from the point we returned to the nmethod. So the return
2519   // address on the stack is wrong by NativeCall::instruction_size
2520   // We will adjust the value so it looks like we have the original return
2521   // address on the stack (like when we eagerly deoptimized).
2522   // In the case of an exception pending when deoptimizing, we enter
2523   // with a return address on the stack that points after the call we patched
2524   // into the exception handler. We have the following register state from,
2525   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2526   //    rax: exception oop
2527   //    rbx: exception handler
2528   //    rdx: throwing pc
2529   // So in this case we simply jam rdx into the useless return address and
2530   // the stack looks just like we want.
2531   //
2532   // At this point we need to de-opt.  We save the argument return
2533   // registers.  We call the first C routine, fetch_unroll_info().  This
2534   // routine captures the return values and returns a structure which
2535   // describes the current frame size and the sizes of all replacement frames.
2536   // The current frame is compiled code and may contain many inlined
2537   // functions, each with their own JVM state.  We pop the current frame, then
2538   // push all the new frames.  Then we call the C routine unpack_frames() to
2539   // populate these frames.  Finally unpack_frames() returns us the new target
2540   // address.  Notice that callee-save registers are BLOWN here; they have
2541   // already been captured in the vframeArray at the time the return PC was
2542   // patched.
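  // The blob generated below has several entry points: the normal deopt entry, the
  // reexecute entry, the exception entries and (with JVMCI) the uncommon trap entries.
  // They all converge at the 'cont' label before calling fetch_unroll_info().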
2543   address start = __ pc();
2544   Label cont;
2545 
2546   // Prolog for non exception case!
2547 
2548   // Save everything in sight.
2549   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2550 
2551   // Normal deoptimization.  Save exec mode for unpack_frames.
2552   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2553   __ jmp(cont);
2554 
2555   int reexecute_offset = __ pc() - start;
2556 #if INCLUDE_JVMCI && !defined(COMPILER1)
2557   if (UseJVMCICompiler) {
2558     // JVMCI does not use this kind of deoptimization
2559     __ should_not_reach_here();
2560   }
2561 #endif
2562 
2563   // Reexecute case
2564   // return address is the pc that describes which bci to re-execute at
2565 
2566   // No need to update map as each call to save_live_registers will produce identical oopmap
2567   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2568 
2569   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2570   __ jmp(cont);
2571 
2572 #if INCLUDE_JVMCI
2573   Label after_fetch_unroll_info_call;
2574   int implicit_exception_uncommon_trap_offset = 0;
2575   int uncommon_trap_offset = 0;
2576 
2577   if (EnableJVMCI) {
2578     implicit_exception_uncommon_trap_offset = __ pc() - start;
2579 
2580     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2581     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2582 
2583     uncommon_trap_offset = __ pc() - start;
2584 
2585     // Save everything in sight.
2586     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2587     // fetch_unroll_info needs to call last_java_frame()
2588     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2589 
2590     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2591     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2592 
2593     __ movl(r14, Deoptimization::Unpack_reexecute);
2594     __ mov(c_rarg0, r15_thread);
2595     __ movl(c_rarg2, r14); // exec mode
2596     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2597     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2598 
2599     __ reset_last_Java_frame(false);
2600 
2601     __ jmp(after_fetch_unroll_info_call);
2602   } // EnableJVMCI
2603 #endif // INCLUDE_JVMCI
2604 
2605   int exception_offset = __ pc() - start;
2606 
2607   // Prolog for exception case
2608 
2609   // all registers are dead at this entry point, except for rax, and
2610   // rdx which contain the exception oop and exception pc
2611   // respectively.  Set them in TLS and fall thru to the
2612   // unpack_with_exception_in_tls entry point.
2613 
2614   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2615   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2616 
2617   int exception_in_tls_offset = __ pc() - start;
2618 
2619   // new implementation because exception oop is now passed in JavaThread
2620 
2621   // Prolog for exception case
2622   // All registers must be preserved because they might be used by LinearScan
2623   // Exception oop and throwing PC are passed in JavaThread
2624   // tos: stack at point of call to method that threw the exception (i.e. only
2625   // args are on the stack, no return address)
2626 
2627   // make room on stack for the return address
2628   // It will be patched later with the throwing pc. The correct value is not
2629   // available now because loading it from memory would destroy registers.
2630   __ push(0);
2631 
2632   // Save everything in sight.
2633   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2634 
2635   // Now it is safe to overwrite any register
2636 
2637   // Deopt during an exception.  Save exec mode for unpack_frames.
2638   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2639 
2640   // load throwing pc from JavaThread and patch it as the return address
2641   // of the current frame. Then clear the field in JavaThread
2642 
2643   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2644   __ movptr(Address(rbp, wordSize), rdx);
2645   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2646 
2647 #ifdef ASSERT
2648   // verify that there is really an exception oop in JavaThread
2649   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2650   __ verify_oop(rax);
2651 
2652   // verify that there is no pending exception
2653   Label no_pending_exception;
2654   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2655   __ testptr(rax, rax);
2656   __ jcc(Assembler::zero, no_pending_exception);
2657   __ stop("must not have pending exception here");
2658   __ bind(no_pending_exception);
2659 #endif
2660 
2661   __ bind(cont);
2662 
2663   // Call C code.  Need thread and this frame, but NOT official VM entry
2664   // crud.  We cannot block on this call, no GC can happen.
2665   //
2666   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2667 
2668   // fetch_unroll_info needs to call last_java_frame().
2669 
2670   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2671 #ifdef ASSERT
2672   { Label L;
2673     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2674     __ jcc(Assembler::equal, L);
2675     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2676     __ bind(L);
2677   }
2678 #endif // ASSERT
2679   __ mov(c_rarg0, r15_thread);
2680   __ movl(c_rarg1, r14); // exec_mode
2681   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2682 
2683   // Need to have an oopmap that tells fetch_unroll_info where to
2684   // find any register it might need.
2685   oop_maps->add_gc_map(__ pc() - start, map);
2686 
2687   __ reset_last_Java_frame(false);
2688 
2689 #if INCLUDE_JVMCI
2690   if (EnableJVMCI) {
2691     __ bind(after_fetch_unroll_info_call);
2692   }
2693 #endif
2694 
2695   // Load UnrollBlock* into rdi
2696   __ mov(rdi, rax);
2697 
2698   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2699   Label noException;
2700   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2701   __ jcc(Assembler::notEqual, noException);
2702   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2703   // QQQ this load is useless: exception_pc was already cleared above
2704   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2705   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2706   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2707 
2708   __ verify_oop(rax);
2709 
2710   // Overwrite the result registers with the exception results.
2711   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2712   // I think this is useless
2713   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2714 
2715   __ bind(noException);
2716 
2717   // Only register save data is on the stack.
2718   // Now restore the result registers.  Everything else is either dead
2719   // or captured in the vframeArray.
2720   RegisterSaver::restore_result_registers(masm);
2721 
2722   // All of the register save area has been popped off the stack. Only the
2723   // return address remains.
2724 
2725   // Pop all the frames we must move/replace.
2726   //
2727   // Frame picture (youngest to oldest)
2728   // 1: self-frame (no frame link)
2729   // 2: deopting frame  (no frame link)
2730   // 3: caller of deopting frame (could be compiled/interpreted).
2731   //
2732   // Note: by leaving the return address of the self-frame on the stack
2733   // and using the size of frame 2 to adjust the stack,
2734   // the return address to frame 3 will still be on the stack when we are done.
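       //
       // Rough sketch (illustrative only) of the stack at this point, higher addresses first:
       //
       //    [ return address into frame 3 ]   <-- rsp after the addptr below
       //    [ body of deopting frame 2    ]
       //    [ self-frame return address   ]   <-- rsp now
       //
       // Advancing rsp by UnrollBlock::size_of_deoptimized_frame therefore leaves the
       // return address into frame 3 on top of the stack.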
2735 
2736   // Pop deoptimized frame
2737   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2738   __ addptr(rsp, rcx);
2739 
2740   // rsp should be pointing at the return address to the caller (3)
2741 
2742   // Pick up the initial fp we should save.
2743   // Restore rbp before the stack bang because if a stack overflow is thrown it needs to be pushed (and preserved).
2744   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2745 
2746 #ifdef ASSERT
2747   // Compilers generate code that bangs the stack by as much as the
2748   // interpreter would need. So this stack banging should never
2749   // trigger a fault. Verify that it does not on non-product builds.
2750   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2751   __ bang_stack_size(rbx, rcx);
2752 #endif
2753 
2754   // Load address of array of frame pcs into rcx
2755   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2756 
2757   // Trash the old pc
2758   __ addptr(rsp, wordSize);
2759 
2760   // Load address of array of frame sizes into rsi
2761   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2762 
2763   // Load counter into rdx
2764   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2765 
2766   // Now adjust the caller's stack to make up for the extra locals,
2767   // but record the original sp first so that we can save it in the skeletal
2768   // interpreter frame; that way the stack walking of interpreter_sender will
2769   // get the unextended sp value and not the "real" sp value.
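       //
       // Roughly speaking, UnrollBlock::caller_adjustment is the number of bytes by which
       // the caller's sp must be extended so that the interpreter frames pushed below have
       // room for their locals and expression stacks, space a compiled caller does not
       // reserve. sender_sp keeps the unextended value so that the skeletal frames record
       // the sp the interpreter stack walker expects.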
2770 
2771   const Register sender_sp = r8;
2772 
2773   __ mov(sender_sp, rsp);
2774   __ movl(rbx, Address(rdi,
2775                        Deoptimization::UnrollBlock::
2776                        caller_adjustment_offset()));
2777   __ subptr(rsp, rbx);
2778 
2779   // Push interpreter frames in a loop
2780   Label loop;
2781   __ bind(loop);
2782   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2783   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2784   __ pushptr(Address(rcx, 0));          // Save return address
2785   __ enter();                           // Save old & set new ebp
2786   __ subptr(rsp, rbx);                  // Prolog
2787   // This value is corrected by layout_activation_impl
2788   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2789   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2790   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2791   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2792   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2793   __ decrementl(rdx);                   // Decrement counter
2794   __ jcc(Assembler::notZero, loop);
2795   __ pushptr(Address(rcx, 0));          // Save final return address
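
       // Each iteration of the loop above builds one skeletal interpreter frame
       // (roughly; layout_activation_impl fills in the details later):
       //
       //    [ return pc from frame_pcs[]               ]
       //    [ saved rbp                                ]  <-- rbp of the new frame
       //    [ frame body of frame_sizes[] - 2 words,   ]
       //    [   with the last_sp slot cleared and the  ]
       //    [   sender_sp slot set to the caller's sp  ]  <-- rsp of the new frame
       //
       // The final pushptr above stores the return address for the self-frame that is
       // re-pushed next.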
2796 
2797   // Re-push self-frame
2798   __ enter();                           // Save old & set new ebp
2799 
2800   // Allocate a full sized register save area.
2801   // Return address and rbp are in place, so we allocate two fewer words.
2802   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2803 
2804   // Restore frame locals after moving the frame
2805   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2806   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2807 
2808   // Call C code.  Need thread but NOT official VM entry
2809   // crud.  We cannot block on this call, no GC can happen.  Call should
2810   // restore return values to their stack-slots with the new SP.
2811   //
2812   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2813 
2814   // Use rbp because the frames look interpreted now
2815   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2816   // Don't need the precise return PC here, just precise enough to point into this code blob.
2817   address the_pc = __ pc();
2818   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2819 
2820   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2821   __ mov(c_rarg0, r15_thread);
2822   __ movl(c_rarg1, r14); // second arg: exec_mode
2823   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2824   // Revert SP alignment after call since we're going to do some SP relative addressing below
2825   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2826 
2827   // Set an oopmap for the call site
2828   // Use the same PC we used for the last java frame
2829   oop_maps->add_gc_map(the_pc - start,
2830                        new OopMap( frame_size_in_words, 0 ));
2831 
2832   // Clear fp AND pc
2833   __ reset_last_Java_frame(true);
2834 
2835   // Collect return values
2836   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2837   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2838   // I think this is useless (throwing pc?)
2839   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2840 
2841   // Pop self-frame.
2842   __ leave();                           // Epilog
2843 
2844   // Jump to interpreter
2845   __ ret(0);
2846 
2847   // Make sure all code is generated
2848   masm->flush();
2849 
2850   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2851   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2852 #if INCLUDE_JVMCI
2853   if (EnableJVMCI) {
2854     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2855     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2856   }
2857 #endif
2858 
2859   AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2860 }
2861 
2862 //------------------------------generate_handler_blob------
2863 //
2864 // Generate a special Compile2Runtime blob that saves all registers,
2865 // and sets up an oopmap.
2866 //
2867 SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
2868   assert(StubRoutines::forward_exception_entry() != nullptr,
2869          "must be generated before");
2870   assert(is_polling_page_id(id), "expected a polling page stub id");
2871 
2872   // Allocate space for the code.  Setup code generation tools.
2873   const char* name = SharedRuntime::stub_name(id);
2874   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
2875   if (blob != nullptr) {
2876     return blob->as_safepoint_blob();
2877   }
2878 
2879   ResourceMark rm;
2880   OopMapSet *oop_maps = new OopMapSet();
2881   OopMap* map;
2882   CodeBuffer buffer(name, 2548, 1024);
2883   MacroAssembler* masm = new MacroAssembler(&buffer);
2884 
2885   address start   = __ pc();
2886   address call_pc = nullptr;
2887   int frame_size_in_words;
2888   bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
2889   bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);
2890 
2891   // Make room for return address (or push it again)
2892   if (!cause_return) {
2893     __ push(rbx);
2894   }
2895 
2896   // Save registers, fpu state, and flags
2897   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
2898 
2899   // The following is basically a call_VM.  However, we need the precise
2900   // address of the call in order to generate an oopmap. Hence, we do all the
2901   // work ourselves.
2902 
2903   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
2904 
2905   // The return address must always be correct so that the frame constructor never
2906   // sees an invalid pc.
2907 
2908   if (!cause_return) {
2909     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
2910     // Additionally, rbx is a callee-saved register, so we can look at it later to determine
2911     // whether someone changed the return address for us.
2912     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
2913     __ movptr(Address(rbp, wordSize), rbx);
2914   }
2915 
2916   // Do the call
2917   __ mov(c_rarg0, r15_thread);
2918   __ call(RuntimeAddress(call_ptr));
2919 
2920   // Set an oopmap for the call site.  This oopmap will map all
2921   // oop-registers and debug-info registers as callee-saved.  This
2922   // will allow deoptimization at this safepoint to find all possible
2923   // debug-info recordings, as well as let GC find all oops.
2924 
2925   oop_maps->add_gc_map( __ pc() - start, map);
2926 
2927   Label noException;
2928 
2929   __ reset_last_Java_frame(false);
2930 
2931   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
2932   __ jcc(Assembler::equal, noException);
2933 
2934   // Exception pending
2935 
2936   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
2937 
2938   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2939 
2940   // No exception case
2941   __ bind(noException);
2942 
2943   Label no_adjust;
2944 #ifdef ASSERT
2945   Label bail;
2946 #endif
2947   if (!cause_return) {
2948     Label no_prefix, not_special, check_rex_prefix;
2949 
2950     // If our stashed return pc was modified by the runtime we avoid touching it
2951     __ cmpptr(rbx, Address(rbp, wordSize));
2952     __ jcc(Assembler::notEqual, no_adjust);
2953 
2954     // Skip over the poll instruction.
2955     // See NativeInstruction::is_safepoint_poll()
2956     // Possible encodings:
2957     //      85 00       test   %eax,(%rax)
2958     //      85 01       test   %eax,(%rcx)
2959     //      85 02       test   %eax,(%rdx)
2960     //      85 03       test   %eax,(%rbx)
2961     //      85 06       test   %eax,(%rsi)
2962     //      85 07       test   %eax,(%rdi)
2963     //
2964     //   41 85 00       test   %eax,(%r8)
2965     //   41 85 01       test   %eax,(%r9)
2966     //   41 85 02       test   %eax,(%r10)
2967     //   41 85 03       test   %eax,(%r11)
2968     //   41 85 06       test   %eax,(%r14)
2969     //   41 85 07       test   %eax,(%r15)
2970     //
2971     //      85 04 24    test   %eax,(%rsp)
2972     //   41 85 04 24    test   %eax,(%r12)
2973     //      85 45 00    test   %eax,0x0(%rbp)
2974     //   41 85 45 00    test   %eax,0x0(%r13)
2975     //
2976     // Notes:
2977     //  Format of the legacy MAP0 test instruction:
2978     //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
2979     //  o  For the safepoint polling instruction "test %eax,(%rax)", the encodings of the first
2980     //     register operand and of the base register of the memory operand lie in [0, 8), so no
2981     //     additional REX prefix (whose REX.B bit holds the MSB of the register encoding) is
2982     //     needed and a two-byte encoding is sufficient.
2983     //  o  For a polling instruction such as "test %eax,(%r8)", the base register of the memory
2984     //     operand encodes as 1000b, so an additional REX prefix is required, adding one byte to
2985     //     the instruction encoding.
2986     //  o  If the base register is one of the extended GPRs available only on targets supporting
2987     //     the Intel APX extension, a two-byte REX2 prefix must be emitted to hold the most
2988     //     significant two bits of the 5-bit register encoding.
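         //
         // Worked example (illustrative): for "41 85 45 00  test %eax,0x0(%r13)" the code
         // below advances rbx by 1 for the REX prefix, by 1 more because the base register
         // encodes like rbp/r13 (an extra displacement byte follows the ModRM byte), and
         // finally by the 2 bytes of opcode + ModRM, i.e. 4 bytes in total, matching the
         // instruction length shown above.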
2989 
2990     if (VM_Version::supports_apx_f()) {
2991       __ cmpb(Address(rbx, 0), Assembler::REX2);
2992       __ jccb(Assembler::notEqual, check_rex_prefix);
2993       __ addptr(rbx, 2);
2994       __ bind(check_rex_prefix);
2995     }
2996     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
2997     __ jccb(Assembler::notEqual, no_prefix);
2998     __ addptr(rbx, 1);
2999     __ bind(no_prefix);
3000 #ifdef ASSERT
3001     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3002 #endif
3003     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3004     // r12/rsp 0x04
3005     // r13/rbp 0x05
3006     __ movzbq(rcx, Address(rbx, 1));
3007     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3008     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3009     __ cmpptr(rcx, 1);
3010     __ jccb(Assembler::above, not_special);
3011     __ addptr(rbx, 1);
3012     __ bind(not_special);
3013 #ifdef ASSERT
3014     // Verify the correct encoding of the poll we're about to skip.
3015     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3016     __ jcc(Assembler::notEqual, bail);
3017     // Mask out the modrm bits
3018     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3019     // rax encodes to 0, so if the bits are nonzero it's incorrect
3020     __ jcc(Assembler::notZero, bail);
3021 #endif
3022     // Adjust return pc forward to step over the safepoint poll instruction
3023     __ addptr(rbx, 2);
3024     __ movptr(Address(rbp, wordSize), rbx);
3025   }
3026 
3027   __ bind(no_adjust);
3028   // Normal exit, restore registers and exit.
3029   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3030   __ ret(0);
3031 
3032 #ifdef ASSERT
3033   __ bind(bail);
3034   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3035 #endif
3036 
3037   // Make sure all code is generated
3038   masm->flush();
3039 
3040   // Fill out the other meta info
3041   SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3042 
3043   AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3044   return sp_blob;
3045 }
3046 
3047 //
3048 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3049 //
3050 // Generate a stub that calls into the VM to find out the proper destination
3051 // of a Java call. All the argument registers are live at this point,
3052 // but since this is generic code we don't know what they are and the caller
3053 // must do any GC of the args.
3054 //
3055 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
3056   assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3057   assert(is_resolve_id(id), "expected a resolve stub id");
3058 
3059   const char* name = SharedRuntime::stub_name(id);
3060   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3061   if (blob != nullptr) {
3062     return blob->as_runtime_stub();
3063   }
3064 
3065   // allocate space for the code
3066   ResourceMark rm;
3067   CodeBuffer buffer(name, 1552, 512);
3068   MacroAssembler* masm = new MacroAssembler(&buffer);
3069 
3070   int frame_size_in_words;
3071 
3072   OopMapSet *oop_maps = new OopMapSet();
3073   OopMap* map = nullptr;
3074 
3075   int start = __ offset();
3076 
3077   // No need to save vector registers since they are caller-saved anyway.
3078   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3079 
3080   int frame_complete = __ offset();
3081 
3082   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3083 
3084   __ mov(c_rarg0, r15_thread);
3085 
3086   __ call(RuntimeAddress(destination));
3087 
3088 
3089   // Set an oopmap for the call site.
3090   // We need this not only for callee-saved registers, but also for volatile
3091   // registers that the compiler might be keeping live across a safepoint.
3092 
3093   oop_maps->add_gc_map( __ offset() - start, map);
3094 
3095   // rax contains the address we are going to jump to assuming no exception got installed
3096 
3097   // clear last_Java_sp
3098   __ reset_last_Java_frame(false);
3099   // check for pending exceptions
3100   Label pending;
3101   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3102   __ jcc(Assembler::notEqual, pending);
3103 
3104   // get the returned Method*
3105   __ get_vm_result_metadata(rbx);
3106   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3107 
3108   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3109 
3110   RegisterSaver::restore_live_registers(masm);
3111 
3112   // We are back to the original state on entry and ready to go.
3113 
3114   __ jmp(rax);
3115 
3116   // Pending exception after the safepoint
3117 
3118   __ bind(pending);
3119 
3120   RegisterSaver::restore_live_registers(masm);
3121 
3122   // exception pending => remove activation and forward to exception handler
3123 
3124   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3125 
3126   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3127   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3128 
3129   // -------------
3130   // make sure all code is generated
3131   masm->flush();
3132 
3133   // return the blob
3134   // (the frame size passed to new_runtime_stub() is in words)
3135   RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3136 
3137   AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3138   return rs_blob;
3139 }
3140 
3141 // Continuation point for throwing of implicit exceptions that are
3142 // not handled in the current activation. Fabricates an exception
3143 // oop and initiates normal exception dispatching in this
3144 // frame. Since we need to preserve callee-saved values (currently
3145 // only for C2, but done for C1 as well) we need a callee-saved oop
3146 // map and therefore have to make these stubs into RuntimeStubs
3147 // rather than BufferBlobs.  If the compiler needs all registers to
3148 // be preserved between the fault point and the exception handler
3149 // then it must assume responsibility for that in
3150 // AbstractCompiler::continuation_for_implicit_null_exception or
3151 // continuation_for_implicit_division_by_zero_exception. All other
3152 // implicit exceptions (e.g., NullPointerException or
3153 // AbstractMethodError on entry) are either at call sites or
3154 // otherwise assume that stack unwinding will be initiated, so
3155 // caller saved registers were assumed volatile in the compiler.
3156 RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
3157   assert(is_throw_id(id), "expected a throw stub id");
3158 
3159   const char* name = SharedRuntime::stub_name(id);
3160 
3161   // Information about frame layout at time of blocking runtime call.
3162   // Note that we only have to preserve callee-saved registers since
3163   // the compilers are responsible for supplying a continuation point
3164   // if they expect all registers to be preserved.
3165   enum layout {
3166     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3167     rbp_off2,
3168     return_off,
3169     return_off2,
3170     framesize // inclusive of return address
3171   };
3172 
3173   int insts_size = 512;
3174   int locs_size  = 64;
3175 
3176   const char* timer_msg = "SharedRuntime generate_throw_exception";
3177   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3178 
3179   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3180   if (blob != nullptr) {
3181     return blob->as_runtime_stub();
3182   }
3183 
3184   ResourceMark rm;
3185   CodeBuffer code(name, insts_size, locs_size);
3186   OopMapSet* oop_maps  = new OopMapSet();
3187   MacroAssembler* masm = new MacroAssembler(&code);
3188 
3189   address start = __ pc();
3190 
3191   // This is an inlined and slightly modified version of call_VM
3192   // which has the ability to fetch the return PC out of
3193   // thread-local storage and also sets up last_Java_sp slightly
3194   // differently than the real call_VM
3195 
3196   __ enter(); // required for proper stackwalking of RuntimeStub frame
3197 
3198   assert(is_even(framesize/2), "sp not 16-byte aligned");
3199 
3200   // return address and rbp are already in place
3201   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3202 
3203   int frame_complete = __ pc() - start;
3204 
3205   // Set up last_Java_sp and last_Java_fp
3206   address the_pc = __ pc();
3207   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3208   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3209 
3210   // Call runtime
3211   __ movptr(c_rarg0, r15_thread);
3212   BLOCK_COMMENT("call runtime_entry");
3213   __ call(RuntimeAddress(runtime_entry));
3214 
3215   // Generate oop map
3216   OopMap* map = new OopMap(framesize, 0);
3217 
3218   oop_maps->add_gc_map(the_pc - start, map);
3219 
3220   __ reset_last_Java_frame(true);
3221 
3222   __ leave(); // required for proper stackwalking of RuntimeStub frame
3223 
3224   // check for pending exceptions
3225 #ifdef ASSERT
3226   Label L;
3227   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3228   __ jcc(Assembler::notEqual, L);
3229   __ should_not_reach_here();
3230   __ bind(L);
3231 #endif // ASSERT
3232   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3233 
3234 
3235   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3236   RuntimeStub* stub =
3237     RuntimeStub::new_runtime_stub(name,
3238                                   &code,
3239                                   frame_complete,
3240                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3241                                   oop_maps, false);
3242   AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3243 
3244   return stub;
3245 }
3246 
3247 //------------------------------Montgomery multiplication------------------------
3248 //
3249 
3250 #ifndef _WINDOWS
3251 
3252 // Subtract 0:b from carry:a.  Return carry.
3253 static julong
3254 sub(julong a[], julong b[], julong carry, long len) {
3255   long long i = 0, cnt = len;
3256   julong tmp;
3257   asm volatile("clc; "
3258                "0: ; "
3259                "mov (%[b], %[i], 8), %[tmp]; "
3260                "sbb %[tmp], (%[a], %[i], 8); "
3261                "inc %[i]; dec %[cnt]; "
3262                "jne 0b; "
3263                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3264                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3265                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3266                : "memory");
3267   return tmp;
3268 }
3269 
3270 // Multiply (unsigned) Long A by Long B, accumulating the double-
3271 // length result into the accumulator formed of T0, T1, and T2.
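     //
     // For reference, the accumulation performed by MACC below is equivalent to this
     // plain C sketch (illustrative only, not compiled here), with (T2:T1:T0) acting
     // as a 192-bit accumulator:
     //
     //   unsigned __int128 p = (unsigned __int128)A * B;
     //   unsigned __int128 s = (unsigned __int128)T0 + (julong)p;
     //   T0 = (julong)s;
     //   s = (s >> 64) + T1 + (julong)(p >> 64);
     //   T1 = (julong)s;
     //   T2 += (julong)(s >> 64);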
3272 #define MACC(A, B, T0, T1, T2)                                  \
3273 do {                                                            \
3274   unsigned long hi, lo;                                         \
3275   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3276            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3277            : "r"(A), "a"(B) : "cc");                            \
3278  } while(0)
3279 
3280 // As above, but add twice the double-length result into the
3281 // accumulator.
3282 #define MACC2(A, B, T0, T1, T2)                                 \
3283 do {                                                            \
3284   unsigned long hi, lo;                                         \
3285   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3286            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3287            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3288            : "r"(A), "a"(B) : "cc");                            \
3289  } while(0)
3290 
3291 #else //_WINDOWS
3292 
3293 static julong
3294 sub(julong a[], julong b[], julong carry, long len) {
3295   long i;
3296   julong tmp;
3297   unsigned char c = 1;
3298   for (i = 0; i < len; i++) {
3299     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3300     a[i] = tmp;
3301   }
3302   c = _addcarry_u64(c, carry, ~0, &tmp);
3303   return tmp;
3304 }
3305 
3306 // Multiply (unsigned) Long A by Long B, accumulating the double-
3307 // length result into the accumulator formed of T0, T1, and T2.
3308 #define MACC(A, B, T0, T1, T2)                          \
3309 do {                                                    \
3310   julong hi, lo;                                        \
3311   lo = _umul128(A, B, &hi);                             \
3312   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3313   c = _addcarry_u64(c, hi, T1, &T1);                    \
3314   _addcarry_u64(c, T2, 0, &T2);                         \
3315  } while(0)
3316 
3317 // As above, but add twice the double-length result into the
3318 // accumulator.
3319 #define MACC2(A, B, T0, T1, T2)                         \
3320 do {                                                    \
3321   julong hi, lo;                                        \
3322   lo = _umul128(A, B, &hi);                             \
3323   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3324   c = _addcarry_u64(c, hi, T1, &T1);                    \
3325   _addcarry_u64(c, T2, 0, &T2);                         \
3326   c = _addcarry_u64(0, lo, T0, &T0);                    \
3327   c = _addcarry_u64(c, hi, T1, &T1);                    \
3328   _addcarry_u64(c, T2, 0, &T2);                         \
3329  } while(0)
3330 
3331 #endif //_WINDOWS
3332 
3333 // Fast Montgomery multiplication.  The derivation of the algorithm is
3334 // in  A Cryptographic Library for the Motorola DSP56000,
3335 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
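     //
     // In outline (illustrative summary): with word size W = 2^64 and a modulus n of
     // len words, the routine computes a * b * W^-len mod n.  Each outer iteration
     // accumulates one column of partial products and then chooses m[i] = t0 * inv,
     // where inv satisfies inv * n[0] == -1 (mod W), so that adding m[i] * n[0]
     // zeroes the low word of the accumulator and the whole value can be shifted
     // right by one word (the t0 = t1; t1 = t2 step).  Any carry left at the end
     // is removed by the trailing subtraction loop.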
3336 
3337 static void NOINLINE
3338 montgomery_multiply(julong a[], julong b[], julong n[],
3339                     julong m[], julong inv, int len) {
3340   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3341   int i;
3342 
3343   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3344 
3345   for (i = 0; i < len; i++) {
3346     int j;
3347     for (j = 0; j < i; j++) {
3348       MACC(a[j], b[i-j], t0, t1, t2);
3349       MACC(m[j], n[i-j], t0, t1, t2);
3350     }
3351     MACC(a[i], b[0], t0, t1, t2);
3352     m[i] = t0 * inv;
3353     MACC(m[i], n[0], t0, t1, t2);
3354 
3355     assert(t0 == 0, "broken Montgomery multiply");
3356 
3357     t0 = t1; t1 = t2; t2 = 0;
3358   }
3359 
3360   for (i = len; i < 2*len; i++) {
3361     int j;
3362     for (j = i-len+1; j < len; j++) {
3363       MACC(a[j], b[i-j], t0, t1, t2);
3364       MACC(m[j], n[i-j], t0, t1, t2);
3365     }
3366     m[i-len] = t0;
3367     t0 = t1; t1 = t2; t2 = 0;
3368   }
3369 
3370   while (t0)
3371     t0 = sub(m, n, t0, len);
3372 }
3373 
3374 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3375 // multiplies so it should be up to 25% faster than Montgomery
3376 // multiplication.  However, its loop control is more complex and it
3377 // may actually run slower on some machines.
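     //
     // The saving comes from symmetry: within a product column, a[j]*a[i-j] and
     // a[i-j]*a[j] are equal, so each pair is computed once and added twice via MACC2,
     // with the lone middle term a[j]*a[j] added once when i is even.  Only the a-by-a
     // half of the work shrinks; the m-by-n reduction multiplies are unchanged, hence
     // the asymptotic ~25% figure.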
3378 
3379 static void NOINLINE
3380 montgomery_square(julong a[], julong n[],
3381                   julong m[], julong inv, int len) {
3382   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3383   int i;
3384 
3385   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3386 
3387   for (i = 0; i < len; i++) {
3388     int j;
3389     int end = (i+1)/2;
3390     for (j = 0; j < end; j++) {
3391       MACC2(a[j], a[i-j], t0, t1, t2);
3392       MACC(m[j], n[i-j], t0, t1, t2);
3393     }
3394     if ((i & 1) == 0) {
3395       MACC(a[j], a[j], t0, t1, t2);
3396     }
3397     for (; j < i; j++) {
3398       MACC(m[j], n[i-j], t0, t1, t2);
3399     }
3400     m[i] = t0 * inv;
3401     MACC(m[i], n[0], t0, t1, t2);
3402 
3403     assert(t0 == 0, "broken Montgomery square");
3404 
3405     t0 = t1; t1 = t2; t2 = 0;
3406   }
3407 
3408   for (i = len; i < 2*len; i++) {
3409     int start = i-len+1;
3410     int end = start + (len - start)/2;
3411     int j;
3412     for (j = start; j < end; j++) {
3413       MACC2(a[j], a[i-j], t0, t1, t2);
3414       MACC(m[j], n[i-j], t0, t1, t2);
3415     }
3416     if ((i & 1) == 0) {
3417       MACC(a[j], a[j], t0, t1, t2);
3418     }
3419     for (; j < len; j++) {
3420       MACC(m[j], n[i-j], t0, t1, t2);
3421     }
3422     m[i-len] = t0;
3423     t0 = t1; t1 = t2; t2 = 0;
3424   }
3425 
3426   while (t0)
3427     t0 = sub(m, n, t0, len);
3428 }
3429 
3430 // Swap words in a longword.
3431 static julong swap(julong x) {
3432   return (x << 32) | (x >> 32);
3433 }
3434 
3435 // Copy len longwords from s to d, word-swapping as we go.  The
3436 // destination array is reversed.
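     // Illustrative example (assuming the jint input is most-significant int first, as
     // in a BigInteger magnitude): the jints {0x1, 0x2, 0x3, 0x4} read as julongs on
     // little-endian x86_64 are {0x0000000200000001, 0x0000000400000003}, and
     // reverse_words() turns them into {0x0000000300000004, 0x0000000100000002},
     // i.e. the same 128-bit value stored least-significant julong first.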
3437 static void reverse_words(julong *s, julong *d, int len) {
3438   d += len;
3439   while(len-- > 0) {
3440     d--;
3441     *d = swap(*s);
3442     s++;
3443   }
3444 }
3445 
3446 // The threshold at which squaring is advantageous was determined
3447 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3448 #define MONTGOMERY_SQUARING_THRESHOLD 64
3449 
3450 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3451                                         jint len, jlong inv,
3452                                         jint *m_ints) {
3453   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3454   int longwords = len/2;
3455 
3456   // Make very sure we don't use so much space that the stack might
3457   // overflow.  512 jints corresponds to a 16384-bit integer and
3458   // will use a total of 8K bytes of stack space here.
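       // For example, at the maximum allowed size here, longwords == 256 and the four
       // scratch arrays take 4 * 256 * 8 == 8192 bytes (the guarantee below allows
       // longwords up to 8192 / (sizeof(julong) * 4) == 256).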
3459   int divisor = sizeof(julong) * 4;
3460   guarantee(longwords <= 8192 / divisor, "must be");
3461   int total_allocation = longwords * sizeof (julong) * 4;
3462   julong *scratch = (julong *)alloca(total_allocation);
3463 
3464   // Local scratch arrays
3465   julong
3466     *a = scratch + 0 * longwords,
3467     *b = scratch + 1 * longwords,
3468     *n = scratch + 2 * longwords,
3469     *m = scratch + 3 * longwords;
3470 
3471   reverse_words((julong *)a_ints, a, longwords);
3472   reverse_words((julong *)b_ints, b, longwords);
3473   reverse_words((julong *)n_ints, n, longwords);
3474 
3475   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3476 
3477   reverse_words(m, (julong *)m_ints, longwords);
3478 }
3479 
3480 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3481                                       jint len, jlong inv,
3482                                       jint *m_ints) {
3483   assert(len % 2 == 0, "array length in montgomery_square must be even");
3484   int longwords = len/2;
3485 
3486   // Make very sure we don't use so much space that the stack might
3487   // overflow.  512 jints corresponds to a 16384-bit integer and
3488   // will use a total of 6K bytes of stack space here.
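       // For example, with len == 512: longwords == 256 and the three scratch arrays
       // take 3 * 256 * 8 == 6144 bytes (~6K); the guarantee below actually allows up
       // to 8192 / (sizeof(julong) * 3) == 341 longwords, i.e. at most ~8K of stack.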
3489   int divisor = sizeof(julong) * 3;
3490   guarantee(longwords <= (8192 / divisor), "must be");
3491   int total_allocation = longwords * sizeof (julong) * 3;
3492   julong *scratch = (julong *)alloca(total_allocation);
3493 
3494   // Local scratch arrays
3495   julong
3496     *a = scratch + 0 * longwords,
3497     *n = scratch + 1 * longwords,
3498     *m = scratch + 2 * longwords;
3499 
3500   reverse_words((julong *)a_ints, a, longwords);
3501   reverse_words((julong *)n_ints, n, longwords);
3502 
3503   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3504     ::montgomery_square(a, n, m, (julong)inv, longwords);
3505   } else {
3506     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3507   }
3508 
3509   reverse_words(m, (julong *)m_ints, longwords);
3510 }
3511 
3512 #if INCLUDE_JFR
3513 
3514 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3515 // It returns a jobject handle to the event writer.
3516 // The handle is dereferenced and the return value is the event writer oop.
3517 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3518   enum layout {
3519     rbp_off,
3520     rbpH_off,
3521     return_off,
3522     return_off2,
3523     framesize // inclusive of return address
3524   };
3525 
3526   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
3527   CodeBuffer code(name, 1024, 64);
3528   MacroAssembler* masm = new MacroAssembler(&code);
3529   address start = __ pc();
3530 
3531   __ enter();
3532   address the_pc = __ pc();
3533 
3534   int frame_complete = the_pc - start;
3535 
3536   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3537   __ movptr(c_rarg0, r15_thread);
3538   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3539   __ reset_last_Java_frame(true);
3540 
3541   // rax is jobject handle result, unpack and process it through a barrier.
3542   __ resolve_global_jobject(rax, c_rarg0);
3543 
3544   __ leave();
3545   __ ret(0);
3546 
3547   OopMapSet* oop_maps = new OopMapSet();
3548   OopMap* map = new OopMap(framesize, 1);
3549   oop_maps->add_gc_map(frame_complete, map);
3550 
3551   RuntimeStub* stub =
3552     RuntimeStub::new_runtime_stub(name,
3553                                   &code,
3554                                   frame_complete,
3555                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3556                                   oop_maps,
3557                                   false);
3558   return stub;
3559 }
3560 
3561 // For c2: call to return a leased buffer.
3562 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3563   enum layout {
3564     rbp_off,
3565     rbpH_off,
3566     return_off,
3567     return_off2,
3568     framesize // inclusive of return address
3569   };
3570 
3571   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
3572   CodeBuffer code(name, 1024, 64);
3573   MacroAssembler* masm = new MacroAssembler(&code);
3574   address start = __ pc();
3575 
3576   __ enter();
3577   address the_pc = __ pc();
3578 
3579   int frame_complete = the_pc - start;
3580 
3581   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3582   __ movptr(c_rarg0, r15_thread);
3583   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3584   __ reset_last_Java_frame(true);
3585 
3586   __ leave();
3587   __ ret(0);
3588 
3589   OopMapSet* oop_maps = new OopMapSet();
3590   OopMap* map = new OopMap(framesize, 1);
3591   oop_maps->add_gc_map(frame_complete, map);
3592 
3593   RuntimeStub* stub =
3594     RuntimeStub::new_runtime_stub(name,
3595                                   &code,
3596                                   frame_complete,
3597                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3598                                   oop_maps,
3599                                   false);
3600   return stub;
3601 }
3602 
3603 #endif // INCLUDE_JFR