1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "code/aotCodeCache.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/timerTrace.hpp"
  56 #include "runtime/vframeArray.hpp"
  57 #include "runtime/vm_version.hpp"
  58 #include "utilities/align.hpp"
  59 #include "utilities/checkedCast.hpp"
  60 #include "utilities/formatBuffer.hpp"
  61 #include "vmreg_x86.inline.hpp"
  62 #ifdef COMPILER1
  63 #include "c1/c1_Runtime1.hpp"
  64 #endif
  65 #ifdef COMPILER2
  66 #include "opto/runtime.hpp"
  67 #endif
  68 #if INCLUDE_JVMCI
  69 #include "jvmci/jvmciJavaClasses.hpp"
  70 #endif
  71 
  72 #define __ masm->
  73 
  74 #ifdef PRODUCT
  75 #define BLOCK_COMMENT(str) /* nothing */
  76 #else
  77 #define BLOCK_COMMENT(str) __ block_comment(str)
  78 #endif // PRODUCT
  79 
  80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  81 
  82 class RegisterSaver {
  83   // Capture info about frame layout.  Layout offsets are in jint
  84   // units because compiler frame slots are jints.
  85 #define XSAVE_AREA_BEGIN 160
  86 #define XSAVE_AREA_YMM_BEGIN 576
  87 #define XSAVE_AREA_EGPRS 960
  88 #define XSAVE_AREA_OPMASK_BEGIN 1088
  89 #define XSAVE_AREA_ZMM_BEGIN 1152
  90 #define XSAVE_AREA_UPPERBANK 1664
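// Note on the constants above: they are byte offsets into the FPU/XSAVE save
// image that save_live_registers() lays out below rsp (following the optional
// argument-register save area). XSAVE_AREA_BEGIN (160) is where xmm0..xmm15
// live inside the legacy fxsave image; the later constants are where the code
// below explicitly spills the remaining state (upper YMM halves, APX extended
// GPRs, opmask registers, upper ZMM halves and zmm16..zmm31). The values mirror
// the usual XSAVE component layout, but the only hard requirement here is that
// save_live_registers() and restore_live_registers() agree on them.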
  91 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  92 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  93 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  94 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  96   enum layout {
  97     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  98     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
  99     DEF_XMM_OFFS(0),
 100     DEF_XMM_OFFS(1),
 101     // 2..15 are implied in range usage
 102     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 103     DEF_YMM_OFFS(0),
 104     DEF_YMM_OFFS(1),
 105     r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 106     r16H_off,
 107     r17_off, r17H_off,
 108     r18_off, r18H_off,
 109     r19_off, r19H_off,
 110     r20_off, r20H_off,
 111     r21_off, r21H_off,
 112     r22_off, r22H_off,
 113     r23_off, r23H_off,
 114     r24_off, r24H_off,
 115     r25_off, r25H_off,
 116     r26_off, r26H_off,
 117     r27_off, r27H_off,
 118     r28_off, r28H_off,
 119     r29_off, r29H_off,
 120     r30_off, r30H_off,
 121     r31_off, r31H_off,
 122     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 123     DEF_OPMASK_OFFS(0),
 124     DEF_OPMASK_OFFS(1),
 125     // 2..7 are implied in range usage
 126     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 127     DEF_ZMM_OFFS(0),
 128     DEF_ZMM_OFFS(1),
 129     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 130     DEF_ZMM_UPPER_OFFS(16),
 131     DEF_ZMM_UPPER_OFFS(17),
 132     // 18..31 are implied in range usage
 133     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 134     fpu_stateH_end,
 135     r15_off, r15H_off,
 136     r14_off, r14H_off,
 137     r13_off, r13H_off,
 138     r12_off, r12H_off,
 139     r11_off, r11H_off,
 140     r10_off, r10H_off,
 141     r9_off,  r9H_off,
 142     r8_off,  r8H_off,
 143     rdi_off, rdiH_off,
 144     rsi_off, rsiH_off,
 145     ignore_off, ignoreH_off,  // extra copy of rbp
 146     rsp_off, rspH_off,
 147     rbx_off, rbxH_off,
 148     rdx_off, rdxH_off,
 149     rcx_off, rcxH_off,
 150     rax_off, raxH_off,
 151     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 152     align_off, alignH_off,
 153     flags_off, flagsH_off,
 154     // The frame sender code expects that rbp will be in the "natural" place and
 155     // will override any oopMap setting for it. We must therefore force the layout
 156     // so that it agrees with the frame sender code.
 157     rbp_off, rbpH_off,        // copy of rbp we will restore
 158     return_off, returnH_off,  // slot for return address
 159     reg_save_size             // size in compiler stack slots
 160   };
 161 
 162  public:
 163   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 164   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 165 
 166   // Offsets into the register save area
 167   // Used by deoptimization when it is managing result register
 168   // values on its own
 169 
 170   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 171   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 172   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 173   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 174   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 175   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 176 
 177   // During deoptimization only the result registers need to be restored,
 178   // all the other values have already been extracted.
 179   static void restore_result_registers(MacroAssembler* masm);
 180 };
 181 
 182 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 183   int off = 0;
 184   int num_xmm_regs = XMMRegister::available_xmm_registers();
 185 #if COMPILER2_OR_JVMCI
 186   if (save_wide_vectors && UseAVX == 0) {
 187     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 188   }
 189   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 190 #else
 191   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 192 #endif
 193 
 194   // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
 195   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 196   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 197   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 198   // CodeBlob frame size is in words.
 199   int frame_size_in_words = frame_size_in_bytes / wordSize;
 200   *total_frame_words = frame_size_in_words;
 201 
 202   // Save registers, fpu state, and flags.
 203   // We assume caller has already pushed the return address onto the
 204   // stack, so rsp is 8-byte aligned here.
  205   // We push rbp twice in this sequence because we want the real rbp
  206   // to be under the return address like a normal enter.
 207 
 208   __ enter();          // rsp becomes 16-byte aligned here
 209   __ pushf();
 210   // Make sure rsp stays 16-byte aligned
 211   __ subq(rsp, 8);
 212   // Push CPU state in multiple of 16 bytes
 213   __ save_legacy_gprs();
 214   __ push_FPU_state();
 215 
 216 
 217   // push cpu state handles this on EVEX enabled targets
 218   if (save_wide_vectors) {
 219     // Save upper half of YMM registers(0..15)
 220     int base_addr = XSAVE_AREA_YMM_BEGIN;
 221     for (int n = 0; n < 16; n++) {
 222       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 223     }
 224     if (VM_Version::supports_evex()) {
 225       // Save upper half of ZMM registers(0..15)
 226       base_addr = XSAVE_AREA_ZMM_BEGIN;
 227       for (int n = 0; n < 16; n++) {
 228         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 229       }
 230       // Save full ZMM registers(16..num_xmm_regs)
 231       base_addr = XSAVE_AREA_UPPERBANK;
 232       off = 0;
 233       int vector_len = Assembler::AVX_512bit;
 234       for (int n = 16; n < num_xmm_regs; n++) {
 235         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 236       }
 237 #if COMPILER2_OR_JVMCI
 238       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 239       off = 0;
 240       for(int n = 0; n < KRegister::number_of_registers; n++) {
 241         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 242       }
 243 #endif
 244     }
 245   } else {
 246     if (VM_Version::supports_evex()) {
 247       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 248       int base_addr = XSAVE_AREA_UPPERBANK;
 249       off = 0;
 250       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 251       for (int n = 16; n < num_xmm_regs; n++) {
 252         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 253       }
 254 #if COMPILER2_OR_JVMCI
 255       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 256       off = 0;
 257       for(int n = 0; n < KRegister::number_of_registers; n++) {
 258         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 259       }
 260 #endif
 261     }
 262   }
 263 
 264 #if COMPILER2_OR_JVMCI
 265   if (UseAPX) {
 266       int base_addr = XSAVE_AREA_EGPRS;
 267       off = 0;
 268       for (int n = 16; n < Register::number_of_registers; n++) {
 269         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 270       }
 271   }
 272 #endif
 273 
 274   __ vzeroupper();
 275   if (frame::arg_reg_save_area_bytes != 0) {
 276     // Allocate argument register save area
 277     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 278   }
 279 
 280   // Set an oopmap for the call site.  This oopmap will map all
 281   // oop-registers and debug-info registers as callee-saved.  This
 282   // will allow deoptimization at this safepoint to find all possible
 283   // debug-info recordings, as well as let GC find all oops.
 284 
 285   OopMapSet *oop_maps = new OopMapSet();
 286   OopMap* map = new OopMap(frame_size_in_slots, 0);
 287 
 288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 289 
 290   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 291   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 293   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 294   // rbp location is known implicitly by the frame sender code, needs no oopmap
  295   // and the location where rbp was saved is ignored
 296   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 297   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 305   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 306 
 307   if (UseAPX) {
 308     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 309     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 324   }
 325   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 326   // on EVEX enabled targets, we get it included in the xsave area
 327   off = xmm0_off;
 328   int delta = xmm1_off - off;
 329   for (int n = 0; n < 16; n++) {
 330     XMMRegister xmm_name = as_XMMRegister(n);
 331     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 332     off += delta;
 333   }
 334   if (UseAVX > 2) {
 335     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 336     off = zmm16_off;
 337     delta = zmm17_off - off;
 338     for (int n = 16; n < num_xmm_regs; n++) {
 339       XMMRegister zmm_name = as_XMMRegister(n);
 340       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 341       off += delta;
 342     }
 343   }
 344 
 345 #if COMPILER2_OR_JVMCI
 346   if (save_wide_vectors) {
 347     // Save upper half of YMM registers(0..15)
 348     off = ymm0_off;
 349     delta = ymm1_off - ymm0_off;
 350     for (int n = 0; n < 16; n++) {
 351       XMMRegister ymm_name = as_XMMRegister(n);
 352       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 353       off += delta;
 354     }
 355     if (VM_Version::supports_evex()) {
 356       // Save upper half of ZMM registers(0..15)
 357       off = zmm0_off;
 358       delta = zmm1_off - zmm0_off;
 359       for (int n = 0; n < 16; n++) {
 360         XMMRegister zmm_name = as_XMMRegister(n);
 361         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 362         off += delta;
 363       }
 364     }
 365   }
 366 #endif // COMPILER2_OR_JVMCI
 367 
 368   // %%% These should all be a waste but we'll keep things as they were for now
 369   if (true) {
 370     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 371     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 373     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 374     // rbp location is known implicitly by the frame sender code, needs no oopmap
 375     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 376     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 385     if (UseAPX) {
 386       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 387       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 402     }
 403     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 404     // on EVEX enabled targets, we get it included in the xsave area
 405     off = xmm0H_off;
 406     delta = xmm1H_off - off;
 407     for (int n = 0; n < 16; n++) {
 408       XMMRegister xmm_name = as_XMMRegister(n);
 409       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 410       off += delta;
 411     }
 412     if (UseAVX > 2) {
 413       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 414       off = zmm16H_off;
 415       delta = zmm17H_off - off;
 416       for (int n = 16; n < num_xmm_regs; n++) {
 417         XMMRegister zmm_name = as_XMMRegister(n);
 418         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 419         off += delta;
 420       }
 421     }
 422   }
 423 
 424   return map;
 425 }
 426 
 427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 428   int num_xmm_regs = XMMRegister::available_xmm_registers();
 429   if (frame::arg_reg_save_area_bytes != 0) {
 430     // Pop arg register save area
 431     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 432   }
 433 
 434 #if COMPILER2_OR_JVMCI
 435   if (restore_wide_vectors) {
 436     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 437     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 438   }
 439 #else
 440   assert(!restore_wide_vectors, "vectors are generated only by C2");
 441 #endif
 442 
 443   __ vzeroupper();
 444 
 445   // On EVEX enabled targets everything is handled in pop fpu state
 446   if (restore_wide_vectors) {
 447     // Restore upper half of YMM registers (0..15)
 448     int base_addr = XSAVE_AREA_YMM_BEGIN;
 449     for (int n = 0; n < 16; n++) {
 450       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 451     }
 452     if (VM_Version::supports_evex()) {
 453       // Restore upper half of ZMM registers (0..15)
 454       base_addr = XSAVE_AREA_ZMM_BEGIN;
 455       for (int n = 0; n < 16; n++) {
 456         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 457       }
 458       // Restore full ZMM registers(16..num_xmm_regs)
 459       base_addr = XSAVE_AREA_UPPERBANK;
 460       int vector_len = Assembler::AVX_512bit;
 461       int off = 0;
 462       for (int n = 16; n < num_xmm_regs; n++) {
 463         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 464       }
 465 #if COMPILER2_OR_JVMCI
 466       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 467       off = 0;
 468       for (int n = 0; n < KRegister::number_of_registers; n++) {
 469         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 470       }
 471 #endif
 472     }
 473   } else {
 474     if (VM_Version::supports_evex()) {
 475       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 476       int base_addr = XSAVE_AREA_UPPERBANK;
 477       int off = 0;
 478       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 479       for (int n = 16; n < num_xmm_regs; n++) {
 480         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 481       }
 482 #if COMPILER2_OR_JVMCI
 483       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 484       off = 0;
 485       for (int n = 0; n < KRegister::number_of_registers; n++) {
 486         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 487       }
 488 #endif
 489     }
 490   }
 491 
 492 #if COMPILER2_OR_JVMCI
 493   if (UseAPX) {
 494     int base_addr = XSAVE_AREA_EGPRS;
 495     int off = 0;
 496     for (int n = 16; n < Register::number_of_registers; n++) {
 497       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 498     }
 499   }
 500 #endif
 501 
 502   // Recover CPU state
 503   __ pop_FPU_state();
 504   __ restore_legacy_gprs();
 505   __ addq(rsp, 8);
 506   __ popf();
 507   // Get the rbp described implicitly by the calling convention (no oopMap)
 508   __ pop(rbp);
 509 }
 510 
 511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 512 
  513   // Just restore result registers. Only used by deoptimization. By
 514   // now any callee save register that needs to be restored to a c2
 515   // caller of the deoptee has been extracted into the vframeArray
 516   // and will be stuffed into the c2i adapter we create for later
 517   // restoration so only result registers need to be restored here.
 518 
 519   // Restore fp result register
 520   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 521   // Restore integer result register
 522   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 523   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 524 
  525   // Pop all of the register save area off the stack except the return address
 526   __ addptr(rsp, return_offset_in_bytes());
 527 }
 528 
  529 // Is the vector's size (in bytes) bigger than the size saved by default?
  530 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
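// For example, is_wide_vector(32) and is_wide_vector(64) return true (256-bit
// and 512-bit vectors need the extra YMM/ZMM spills done in
// save_live_registers()), while 16-byte XMM values are already covered by the
// default fxsave image.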
 531 bool SharedRuntime::is_wide_vector(int size) {
 532   return size > 16;
 533 }
 534 
 535 // ---------------------------------------------------------------------------
 536 // Read the array of BasicTypes from a signature, and compute where the
 537 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 538 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 539 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 540 // as framesizes are fixed.
  541 // VMRegImpl::stack0 refers to the first slot 0(sp),
  542 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
  543 // Registers up to Register::number_of_registers are the 64-bit
 544 // integer registers.
 545 
 546 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 547 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 548 // units regardless of build. Of course for i486 there is no 64 bit build
 549 
 550 // The Java calling convention is a "shifted" version of the C ABI.
 551 // By skipping the first C ABI register we can call non-static jni methods
 552 // with small numbers of arguments without having to shuffle the arguments
 553 // at all. Since we control the java ABI we ought to at least get some
 554 // advantage out of it.
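// Illustrative example (a sketch of what the loop below computes): for a Java
// signature (int, long, Object, float, double) the assignments would be
//   int    -> j_rarg0
//   long   -> j_rarg1 (its T_VOID half gets no register)
//   Object -> j_rarg2
//   float  -> j_farg0
//   double -> j_farg1 (its T_VOID half gets no register)
// Only once the 6 integer or 8 float argument registers are exhausted do
// arguments spill to stack slots, which are handed out in 2-slot (8-byte)
// aligned units.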
 555 
 556 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 557                                            VMRegPair *regs,
 558                                            int total_args_passed) {
 559 
 560   // Create the mapping between argument positions and
 561   // registers.
 562   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 563     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 564   };
 565   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 566     j_farg0, j_farg1, j_farg2, j_farg3,
 567     j_farg4, j_farg5, j_farg6, j_farg7
 568   };
 569 
 570 
 571   uint int_args = 0;
 572   uint fp_args = 0;
 573   uint stk_args = 0;
 574 
 575   for (int i = 0; i < total_args_passed; i++) {
 576     switch (sig_bt[i]) {
 577     case T_BOOLEAN:
 578     case T_CHAR:
 579     case T_BYTE:
 580     case T_SHORT:
 581     case T_INT:
 582       if (int_args < Argument::n_int_register_parameters_j) {
 583         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 584       } else {
 585         stk_args = align_up(stk_args, 2);
 586         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 587         stk_args += 1;
 588       }
 589       break;
 590     case T_VOID:
 591       // halves of T_LONG or T_DOUBLE
 592       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 593       regs[i].set_bad();
 594       break;
 595     case T_LONG:
 596       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 597       // fall through
 598     case T_OBJECT:
 599     case T_ARRAY:
 600     case T_ADDRESS:
 601       if (int_args < Argument::n_int_register_parameters_j) {
 602         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 603       } else {
 604         stk_args = align_up(stk_args, 2);
 605         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 606         stk_args += 2;
 607       }
 608       break;
 609     case T_FLOAT:
 610       if (fp_args < Argument::n_float_register_parameters_j) {
 611         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 612       } else {
 613         stk_args = align_up(stk_args, 2);
 614         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 615         stk_args += 1;
 616       }
 617       break;
 618     case T_DOUBLE:
 619       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 620       if (fp_args < Argument::n_float_register_parameters_j) {
 621         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 622       } else {
 623         stk_args = align_up(stk_args, 2);
 624         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 625         stk_args += 2;
 626       }
 627       break;
 628     default:
 629       ShouldNotReachHere();
 630       break;
 631     }
 632   }
 633 
 634   return stk_args;
 635 }
 636 
  637 // Patch the caller's callsite with the entry to compiled code if it exists.
 638 static void patch_callers_callsite(MacroAssembler *masm) {
 639   Label L;
 640   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 641   __ jcc(Assembler::equal, L);
 642 
 643   // Save the current stack pointer
 644   __ mov(r13, rsp);
 645   // Schedule the branch target address early.
 646   // Call into the VM to patch the caller, then jump to compiled callee
 647   // rax isn't live so capture return address while we easily can
 648   __ movptr(rax, Address(rsp, 0));
 649 
 650   // align stack so push_CPU_state doesn't fault
 651   __ andptr(rsp, -(StackAlignmentInBytes));
 652   __ push_CPU_state();
 653   __ vzeroupper();
 654   // VM needs caller's callsite
 655   // VM needs target method
 656   // This needs to be a long call since we will relocate this adapter to
 657   // the codeBuffer and it may not reach
 658 
 659   // Allocate argument register save area
 660   if (frame::arg_reg_save_area_bytes != 0) {
 661     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 662   }
 663   __ mov(c_rarg0, rbx);
 664   __ mov(c_rarg1, rax);
 665   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 666 
 667   // De-allocate argument register save area
 668   if (frame::arg_reg_save_area_bytes != 0) {
 669     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 670   }
 671 
 672   __ vzeroupper();
 673   __ pop_CPU_state();
 674   // restore sp
 675   __ mov(rsp, r13);
 676   __ bind(L);
 677 }
 678 
 679 static void gen_c2i_adapter(MacroAssembler *masm,
 680                             int total_args_passed,
 681                             int comp_args_on_stack,
 682                             const BasicType *sig_bt,
 683                             const VMRegPair *regs,
 684                             Label& skip_fixup) {
 685   // Before we get into the guts of the C2I adapter, see if we should be here
 686   // at all.  We've come from compiled code and are attempting to jump to the
 687   // interpreter, which means the caller made a static call to get here
 688   // (vcalls always get a compiled target if there is one).  Check for a
 689   // compiled target.  If there is one, we need to patch the caller's call.
 690   patch_callers_callsite(masm);
 691 
 692   __ bind(skip_fixup);
 693 
 694   // Since all args are passed on the stack, total_args_passed *
 695   // Interpreter::stackElementSize is the space we need.
 696 
 697   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 698 
 699   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 700 
 701   // stack is aligned, keep it that way
 702   // This is not currently needed or enforced by the interpreter, but
 703   // we might as well conform to the ABI.
 704   extraspace = align_up(extraspace, 2*wordSize);
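  // For example (assuming Interpreter::stackElementSize == wordSize == 8 on
  // x86_64): 3 incoming args need 24 bytes of outgoing space, rounded up here
  // to 32 so that rsp stays 16-byte aligned.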
 705 
 706   // set senderSP value
 707   __ lea(r13, Address(rsp, wordSize));
 708 
 709 #ifdef ASSERT
 710   __ check_stack_alignment(r13, "sender stack not aligned");
 711 #endif
 712   if (extraspace > 0) {
 713     // Pop the return address
 714     __ pop(rax);
 715 
 716     __ subptr(rsp, extraspace);
 717 
 718     // Push the return address
 719     __ push(rax);
 720 
 721     // Account for the return address location since we store it first rather
 722     // than hold it in a register across all the shuffling
 723     extraspace += wordSize;
 724   }
 725 
 726 #ifdef ASSERT
 727   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 728 #endif
 729 
 730   // Now write the args into the outgoing interpreter space
 731   for (int i = 0; i < total_args_passed; i++) {
 732     if (sig_bt[i] == T_VOID) {
 733       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 734       continue;
 735     }
 736 
 737     // offset to start parameters
 738     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 739     int next_off = st_off - Interpreter::stackElementSize;
 740 
 741     // Say 4 args:
 742     // i   st_off
 743     // 0   32 T_LONG
 744     // 1   24 T_VOID
 745     // 2   16 T_OBJECT
 746     // 3    8 T_BOOL
 747     // -    0 return address
 748     //
  749     // However, to make things extra confusing: because we can fit a long/double in
  750     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
  751     // leaves one slot empty and only stores to a single slot. In this case the
  752     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 753 
 754     VMReg r_1 = regs[i].first();
 755     VMReg r_2 = regs[i].second();
 756     if (!r_1->is_valid()) {
 757       assert(!r_2->is_valid(), "");
 758       continue;
 759     }
 760     if (r_1->is_stack()) {
 761       // memory to memory use rax
 762       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 763       if (!r_2->is_valid()) {
 764         // sign extend??
 765         __ movl(rax, Address(rsp, ld_off));
 766         __ movptr(Address(rsp, st_off), rax);
 767 
 768       } else {
 769 
 770         __ movq(rax, Address(rsp, ld_off));
 771 
  772         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 773         // T_DOUBLE and T_LONG use two slots in the interpreter
 774         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 775           // ld_off == LSW, ld_off+wordSize == MSW
 776           // st_off == MSW, next_off == LSW
 777           __ movq(Address(rsp, next_off), rax);
 778 #ifdef ASSERT
 779           // Overwrite the unused slot with known junk
 780           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 781           __ movptr(Address(rsp, st_off), rax);
 782 #endif /* ASSERT */
 783         } else {
 784           __ movq(Address(rsp, st_off), rax);
 785         }
 786       }
 787     } else if (r_1->is_Register()) {
 788       Register r = r_1->as_Register();
 789       if (!r_2->is_valid()) {
  790         // must be only an int (or less) so move only 32 bits to the slot
 791         // why not sign extend??
 792         __ movl(Address(rsp, st_off), r);
 793       } else {
  794         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 795         // T_DOUBLE and T_LONG use two slots in the interpreter
 796         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 797           // long/double in gpr
 798 #ifdef ASSERT
 799           // Overwrite the unused slot with known junk
 800           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 801           __ movptr(Address(rsp, st_off), rax);
 802 #endif /* ASSERT */
 803           __ movq(Address(rsp, next_off), r);
 804         } else {
 805           __ movptr(Address(rsp, st_off), r);
 806         }
 807       }
 808     } else {
 809       assert(r_1->is_XMMRegister(), "");
 810       if (!r_2->is_valid()) {
 811         // only a float use just part of the slot
 812         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 813       } else {
 814 #ifdef ASSERT
 815         // Overwrite the unused slot with known junk
 816         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 817         __ movptr(Address(rsp, st_off), rax);
 818 #endif /* ASSERT */
 819         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 820       }
 821     }
 822   }
 823 
 824   // Schedule the branch target address early.
 825   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 826   __ jmp(rcx);
 827 }
 828 
 829 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 830                                     int total_args_passed,
 831                                     int comp_args_on_stack,
 832                                     const BasicType *sig_bt,
 833                                     const VMRegPair *regs) {
 834 
 835   // Note: r13 contains the senderSP on entry. We must preserve it since
  836   // we may do an i2c -> c2i transition if we lose a race where compiled
 837   // code goes non-entrant while we get args ready.
 838   // In addition we use r13 to locate all the interpreter args as
 839   // we must align the stack to 16 bytes on an i2c entry else we
 840   // lose alignment we expect in all compiled code and register
  841   // save code can segv when fxsave instructions find an improperly
 842   // aligned stack pointer.
 843 
 844   // Adapters can be frameless because they do not require the caller
 845   // to perform additional cleanup work, such as correcting the stack pointer.
 846   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 847   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 848   // even if a callee has modified the stack pointer.
 849   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 850   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 851   // up via the senderSP register).
 852   // In other words, if *either* the caller or callee is interpreted, we can
 853   // get the stack pointer repaired after a call.
 854   // This is why c2i and i2c adapters cannot be indefinitely composed.
 855   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 856   // both caller and callee would be compiled methods, and neither would
 857   // clean up the stack pointer changes performed by the two adapters.
 858   // If this happens, control eventually transfers back to the compiled
 859   // caller, but with an uncorrected stack, causing delayed havoc.
 860 
 861   // Must preserve original SP for loading incoming arguments because
 862   // we need to align the outgoing SP for compiled code.
 863   __ movptr(r11, rsp);
 864 
 865   // Pick up the return address
 866   __ pop(rax);
 867 
 868   // Convert 4-byte c2 stack slots to words.
 869   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
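  // For example, comp_args_on_stack == 5 (five 4-byte slots) becomes
  // align_up(20, 8) >> 3 == 3 words of outgoing space.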
 870 
 871   if (comp_args_on_stack) {
 872     __ subptr(rsp, comp_words_on_stack * wordSize);
 873   }
 874 
 875   // Ensure compiled code always sees stack at proper alignment
 876   __ andptr(rsp, -16);
 877 
  878   // Push the return address, misaligning the stack so that the youngest frame
  879   // sees it just as it would immediately after a call instruction.
 880   __ push(rax);
 881 
 882   // Put saved SP in another register
 883   const Register saved_sp = rax;
 884   __ movptr(saved_sp, r11);
 885 
 886   // Will jump to the compiled code just as if compiled code was doing it.
 887   // Pre-load the register-jump target early, to schedule it better.
 888   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 889 
 890 #if INCLUDE_JVMCI
 891   if (EnableJVMCI) {
 892     // check if this call should be routed towards a specific entry point
 893     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 894     Label no_alternative_target;
 895     __ jcc(Assembler::equal, no_alternative_target);
 896     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 897     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 898     __ bind(no_alternative_target);
 899   }
 900 #endif // INCLUDE_JVMCI
 901 
 902   // Now generate the shuffle code.  Pick up all register args and move the
 903   // rest through the floating point stack top.
 904   for (int i = 0; i < total_args_passed; i++) {
 905     if (sig_bt[i] == T_VOID) {
 906       // Longs and doubles are passed in native word order, but misaligned
 907       // in the 32-bit build.
 908       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 909       continue;
 910     }
 911 
 912     // Pick up 0, 1 or 2 words from SP+offset.
 913 
 914     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 915             "scrambled load targets?");
 916     // Load in argument order going down.
 917     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 918     // Point to interpreter value (vs. tag)
 919     int next_off = ld_off - Interpreter::stackElementSize;
 920     //
 921     //
 922     //
 923     VMReg r_1 = regs[i].first();
 924     VMReg r_2 = regs[i].second();
 925     if (!r_1->is_valid()) {
 926       assert(!r_2->is_valid(), "");
 927       continue;
 928     }
 929     if (r_1->is_stack()) {
 930       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 931       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 932 
 933       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 934       // and if we end up going thru a c2i because of a miss a reasonable value of r13
 935       // will be generated.
 936       if (!r_2->is_valid()) {
 937         // sign extend???
 938         __ movl(r13, Address(saved_sp, ld_off));
 939         __ movptr(Address(rsp, st_off), r13);
 940       } else {
 941         //
  942         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
  943         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
 944         // So we must adjust where to pick up the data to match the interpreter.
 945         //
  946         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
  947         // are accessed at negative offsets, so the LSW is at the LOW address
 948 
 949         // ld_off is MSW so get LSW
 950         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 951                            next_off : ld_off;
 952         __ movq(r13, Address(saved_sp, offset));
 953         // st_off is LSW (i.e. reg.first())
 954         __ movq(Address(rsp, st_off), r13);
 955       }
 956     } else if (r_1->is_Register()) {  // Register argument
 957       Register r = r_1->as_Register();
 958       assert(r != rax, "must be different");
 959       if (r_2->is_valid()) {
 960         //
  961         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
  962         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
 963         // So we must adjust where to pick up the data to match the interpreter.
 964 
 965         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 966                            next_off : ld_off;
 967 
 968         // this can be a misaligned move
 969         __ movq(r, Address(saved_sp, offset));
 970       } else {
 971         // sign extend and use a full word?
 972         __ movl(r, Address(saved_sp, ld_off));
 973       }
 974     } else {
 975       if (!r_2->is_valid()) {
 976         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 977       } else {
 978         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 979       }
 980     }
 981   }
 982 
 983   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 984 
 985   // 6243940 We might end up in handle_wrong_method if
 986   // the callee is deoptimized as we race thru here. If that
 987   // happens we don't want to take a safepoint because the
 988   // caller frame will look interpreted and arguments are now
 989   // "compiled" so it is much better to make this transition
 990   // invisible to the stack walking code. Unfortunately if
 991   // we try and find the callee by normal means a safepoint
 992   // is possible. So we stash the desired callee in the thread
  993   // and the VM will find it there should this case occur.
 994 
 995   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 996 
  997   // put Method* where a c2i would expect it should we end up there;
  998   // only needed because c2 resolve stubs return Method* as a result in
 999   // rax
1000   __ mov(rax, rbx);
1001   __ jmp(r11);
1002 }
1003 
1004 // ---------------------------------------------------------------
1005 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1006                                             int total_args_passed,
1007                                             int comp_args_on_stack,
1008                                             const BasicType *sig_bt,
1009                                             const VMRegPair *regs,
1010                                             AdapterHandlerEntry* handler) {
1011   address i2c_entry = __ pc();
1012 
1013   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1014 
1015   // -------------------------------------------------------------------------
1016   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1017   // to the interpreter.  The args start out packed in the compiled layout.  They
1018   // need to be unpacked into the interpreter layout.  This will almost always
1019   // require some stack space.  We grow the current (compiled) stack, then repack
1020   // the args.  We  finally end in a jump to the generic interpreter entry point.
1021   // On exit from the interpreter, the interpreter will restore our SP (lest the
1022   // compiled code, which relies solely on SP and not RBP, get sick).
1023 
1024   address c2i_unverified_entry = __ pc();
1025   Label skip_fixup;
1026 
1027   Register data = rax;
1028   Register receiver = j_rarg0;
1029   Register temp = rbx;
1030 
1031   {
1032     __ ic_check(1 /* end_alignment */);
1033     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1034     // Method might have been compiled since the call site was patched to
 1035     // interpreted; if that is the case, treat it as a miss so we can get
1036     // the call site corrected.
1037     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1038     __ jcc(Assembler::equal, skip_fixup);
1039     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1040   }
1041 
1042   address c2i_entry = __ pc();
1043 
1044   // Class initialization barrier for static methods
1045   address c2i_no_clinit_check_entry = nullptr;
1046   if (VM_Version::supports_fast_class_init_checks()) {
1047     Label L_skip_barrier;
1048     Register method = rbx;
1049 
1050     { // Bypass the barrier for non-static methods
1051       Register flags = rscratch1;
1052       __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
1053       __ testl(flags, JVM_ACC_STATIC);
1054       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1055     }
1056 
1057     Register klass = rscratch1;
1058     __ load_method_holder(klass, method);
1059     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1060 
1061     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1062 
1063     __ bind(L_skip_barrier);
1064     c2i_no_clinit_check_entry = __ pc();
1065   }
1066 
1067   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1068   bs->c2i_entry_barrier(masm);
1069 
1070   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1071 
1072   handler->set_entry_points(i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1073   return;
1074 }
1075 
1076 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1077                                          VMRegPair *regs,
1078                                          int total_args_passed) {
1079 
1080 // We return the amount of VMRegImpl stack slots we need to reserve for all
1081 // the arguments NOT counting out_preserve_stack_slots.
1082 
1083 // NOTE: These arrays will have to change when c1 is ported
1084 #ifdef _WIN64
1085     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1086       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1087     };
1088     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1089       c_farg0, c_farg1, c_farg2, c_farg3
1090     };
1091 #else
1092     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1093       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1094     };
1095     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1096       c_farg0, c_farg1, c_farg2, c_farg3,
1097       c_farg4, c_farg5, c_farg6, c_farg7
1098     };
1099 #endif // _WIN64
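    // Illustrative example (not generated code): for a native signature
    // (jint, jdouble, jobject) the loop below assigns
    //   System V: jint -> c_rarg0, jdouble -> c_farg0, jobject -> c_rarg1,
    //             and no outgoing stack slots are needed (returns 0).
    //   Win64:    jint -> c_rarg0, jdouble -> c_farg1, jobject -> c_rarg2,
    //             because integer and FP argument positions advance together,
    //             and the return value is bumped to 8 slots for the 32-byte
    //             register home space.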
1100 
1101 
1102     uint int_args = 0;
1103     uint fp_args = 0;
1104     uint stk_args = 0; // inc by 2 each time
1105 
1106     for (int i = 0; i < total_args_passed; i++) {
1107       switch (sig_bt[i]) {
1108       case T_BOOLEAN:
1109       case T_CHAR:
1110       case T_BYTE:
1111       case T_SHORT:
1112       case T_INT:
1113         if (int_args < Argument::n_int_register_parameters_c) {
1114           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1115 #ifdef _WIN64
1116           fp_args++;
 1117           // Allocate slots for callee to stuff register args on the stack.
1118           stk_args += 2;
1119 #endif
1120         } else {
1121           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1122           stk_args += 2;
1123         }
1124         break;
1125       case T_LONG:
1126         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1127         // fall through
1128       case T_OBJECT:
1129       case T_ARRAY:
1130       case T_ADDRESS:
1131       case T_METADATA:
1132         if (int_args < Argument::n_int_register_parameters_c) {
1133           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1134 #ifdef _WIN64
1135           fp_args++;
1136           stk_args += 2;
1137 #endif
1138         } else {
1139           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1140           stk_args += 2;
1141         }
1142         break;
1143       case T_FLOAT:
1144         if (fp_args < Argument::n_float_register_parameters_c) {
1145           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1146 #ifdef _WIN64
1147           int_args++;
 1148           // Allocate slots for callee to stuff register args on the stack.
1149           stk_args += 2;
1150 #endif
1151         } else {
1152           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1153           stk_args += 2;
1154         }
1155         break;
1156       case T_DOUBLE:
1157         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1158         if (fp_args < Argument::n_float_register_parameters_c) {
1159           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1160 #ifdef _WIN64
1161           int_args++;
 1162           // Allocate slots for callee to stuff register args on the stack.
1163           stk_args += 2;
1164 #endif
1165         } else {
1166           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1167           stk_args += 2;
1168         }
1169         break;
1170       case T_VOID: // Halves of longs and doubles
1171         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1172         regs[i].set_bad();
1173         break;
1174       default:
1175         ShouldNotReachHere();
1176         break;
1177       }
1178     }
1179 #ifdef _WIN64
 1180   // The Windows ABI requires that we always allocate enough stack space
 1181   // for 4 64-bit registers to be stored down.
1182   if (stk_args < 8) {
1183     stk_args = 8;
1184   }
1185 #endif // _WIN64
1186 
1187   return stk_args;
1188 }
1189 
1190 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1191                                              uint num_bits,
1192                                              uint total_args_passed) {
1193   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1194          "only certain vector sizes are supported for now");
1195 
1196   static const XMMRegister VEC_ArgReg[32] = {
1197      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1198      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1199     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1200     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1201   };
1202 
1203   uint stk_args = 0;
1204   uint fp_args = 0;
1205 
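  // Each vector argument gets a whole XMM/YMM/ZMM register; next_val below is
  // the number of additional 32-bit VMReg slots the pair spans, e.g. 7 more
  // slots for a 256-bit vector so the pair covers 8 slots in total.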
1206   for (uint i = 0; i < total_args_passed; i++) {
1207     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1208     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1209     regs[i].set_pair(vmreg->next(next_val), vmreg);
1210   }
1211 
1212   return stk_args;
1213 }
1214 
1215 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
 1216   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1217   // which by this time is free to use
1218   switch (ret_type) {
1219   case T_FLOAT:
1220     __ movflt(Address(rbp, -wordSize), xmm0);
1221     break;
1222   case T_DOUBLE:
1223     __ movdbl(Address(rbp, -wordSize), xmm0);
1224     break;
1225   case T_VOID:  break;
1226   default: {
1227     __ movptr(Address(rbp, -wordSize), rax);
1228     }
1229   }
1230 }
1231 
1232 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
 1233   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1234   // which by this time is free to use
1235   switch (ret_type) {
1236   case T_FLOAT:
1237     __ movflt(xmm0, Address(rbp, -wordSize));
1238     break;
1239   case T_DOUBLE:
1240     __ movdbl(xmm0, Address(rbp, -wordSize));
1241     break;
1242   case T_VOID:  break;
1243   default: {
1244     __ movptr(rax, Address(rbp, -wordSize));
1245     }
1246   }
1247 }
1248 
1249 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1250     for ( int i = first_arg ; i < arg_count ; i++ ) {
1251       if (args[i].first()->is_Register()) {
1252         __ push(args[i].first()->as_Register());
1253       } else if (args[i].first()->is_XMMRegister()) {
1254         __ subptr(rsp, 2*wordSize);
1255         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1256       }
1257     }
1258 }
1259 
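// Mirror of save_args(): walks the arguments in reverse so the pop and
// movdbl/addptr pairs match the push and subptr/movdbl pairs done above.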
1260 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1261     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1262       if (args[i].first()->is_Register()) {
1263         __ pop(args[i].first()->as_Register());
1264       } else if (args[i].first()->is_XMMRegister()) {
1265         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1266         __ addptr(rsp, 2*wordSize);
1267       }
1268     }
1269 }
1270 
1271 static void verify_oop_args(MacroAssembler* masm,
1272                             const methodHandle& method,
1273                             const BasicType* sig_bt,
1274                             const VMRegPair* regs) {
1275   Register temp_reg = rbx;  // not part of any compiled calling seq
1276   if (VerifyOops) {
1277     for (int i = 0; i < method->size_of_parameters(); i++) {
1278       if (is_reference_type(sig_bt[i])) {
1279         VMReg r = regs[i].first();
1280         assert(r->is_valid(), "bad oop arg");
1281         if (r->is_stack()) {
1282           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1283           __ verify_oop(temp_reg);
1284         } else {
1285           __ verify_oop(r->as_Register());
1286         }
1287       }
1288     }
1289   }
1290 }
1291 
1292 static void check_continuation_enter_argument(VMReg actual_vmreg,
1293                                               Register expected_reg,
1294                                               const char* name) {
1295   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1296   assert(actual_vmreg->as_Register() == expected_reg,
1297          "%s is in unexpected register: %s instead of %s",
1298          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1299 }
1300 
1301 
1302 //---------------------------- continuation_enter_setup ---------------------------
1303 //
1304 // Arguments:
1305 //   None.
1306 //
1307 // Results:
1308 //   rsp: pointer to blank ContinuationEntry
1309 //
1310 // Kills:
1311 //   rax
1312 //
1313 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1314   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1315   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1316   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1317 
1318   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1319   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1320 
1321   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1322   OopMap* map = new OopMap(frame_size, 0);
1323 
1324   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1325   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1326   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1327 
1328   return map;
1329 }
1330 
1331 //---------------------------- fill_continuation_entry ---------------------------
1332 //
1333 // Arguments:
1334 //   rsp: pointer to blank Continuation entry
1335 //   reg_cont_obj: pointer to the continuation
1336 //   reg_flags: flags
1337 //
1338 // Results:
1339 //   rsp: pointer to filled out ContinuationEntry
1340 //
1341 // Kills:
1342 //   rax
1343 //
1344 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1345   assert_different_registers(rax, reg_cont_obj, reg_flags);
1346 #ifdef ASSERT
1347   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1348 #endif
1349   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1350   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1351   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1352   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1353   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1354 
1355   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1356   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1357   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1358   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1359 
1360   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1361   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1362 }
1363 
1364 //---------------------------- continuation_enter_cleanup ---------------------------
1365 //
1366 // Arguments:
1367 //   rsp: pointer to the ContinuationEntry
1368 //
1369 // Results:
1370 //   rsp: pointer to the spilled rbp in the entry frame
1371 //
1372 // Kills:
1373 //   rbx
1374 //
1375 static void continuation_enter_cleanup(MacroAssembler* masm) {
1376 #ifdef ASSERT
1377   Label L_good_sp;
1378   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1379   __ jcc(Assembler::equal, L_good_sp);
1380   __ stop("Incorrect rsp at continuation_enter_cleanup");
1381   __ bind(L_good_sp);
1382 #endif
1383   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1384   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1385 
1386   if (CheckJNICalls) {
1387     // Check if this is a virtual thread continuation
1388     Label L_skip_vthread_code;
1389     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1390     __ jcc(Assembler::equal, L_skip_vthread_code);
1391 
1392     // If the held monitor count is > 0 and this vthread is terminating then
1393     // it failed to release a JNI monitor. So we issue the same log message
1394     // that JavaThread::exit does.
1395     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1396     __ jcc(Assembler::equal, L_skip_vthread_code);
1397 
1398     // rax may hold an exception oop, save it before the call
1399     __ push(rax);
1400     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1401     __ pop(rax);
1402 
1403     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1404     // on termination. The held count is implicitly zeroed below when we restore from
1405     // the parent held count (which has to be zero).
1406     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1407 
1408     __ bind(L_skip_vthread_code);
1409   }
1410 #ifdef ASSERT
1411   else {
1412     // Check if this is a virtual thread continuation
1413     Label L_skip_vthread_code;
1414     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1415     __ jcc(Assembler::equal, L_skip_vthread_code);
1416 
1417     // See comment just above. If not checking JNI calls, the JNI count is only
1418     // needed for assertion checking.
1419     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1420 
1421     __ bind(L_skip_vthread_code);
1422   }
1423 #endif
1424 
1425   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1426   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1427 
1428   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1429   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1430   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1431 }
1432 
1433 static void gen_continuation_enter(MacroAssembler* masm,
1434                                    const VMRegPair* regs,
1435                                    int& exception_offset,
1436                                    OopMapSet* oop_maps,
1437                                    int& frame_complete,
1438                                    int& stack_slots,
1439                                    int& interpreted_entry_offset,
1440                                    int& compiled_entry_offset) {
1441 
1442   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1443   int pos_cont_obj   = 0;
1444   int pos_is_cont    = 1;
1445   int pos_is_virtual = 2;
1446 
1447   // The platform-specific calling convention may present the arguments in various registers.
1448   // To simplify the rest of the code, we expect the arguments to reside in these known
1449   // registers, and we additionally check the placement here in case the calling convention
1450   // ever changes.
1451   Register reg_cont_obj   = c_rarg1;
1452   Register reg_is_cont    = c_rarg2;
1453   Register reg_is_virtual = c_rarg3;
1454 
1455   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1456   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1457   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1458 
1459   // Utility methods kill rax, make sure there are no collisions
1460   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1461 
1462   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1463                          relocInfo::static_call_type);
1464 
1465   address start = __ pc();
1466 
1467   Label L_thaw, L_exit;
1468 
1469   // i2i entry used at interp_only_mode only
1470   interpreted_entry_offset = __ pc() - start;
1471   {
1472 #ifdef ASSERT
1473     Label is_interp_only;
1474     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1475     __ jcc(Assembler::notEqual, is_interp_only);
1476     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1477     __ bind(is_interp_only);
1478 #endif
1479 
1480     __ pop(rax); // return address
1481     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
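         // The interpreter keeps the arguments on its expression stack with the first
         // declared argument farthest from rsp: slot 2 is the Continuation object,
         // slot 1 is isContinue and slot 0 is isVirtualThread (see enterSpecial above).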
1482     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1483     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1484     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1485     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1486     __ push(rax); // return address
1487     __ push_cont_fastpath();
1488 
1489     __ enter();
1490 
1491     stack_slots = 2; // will be adjusted in setup
1492     OopMap* map = continuation_enter_setup(masm, stack_slots);
1493     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1494     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1495 
1496     __ verify_oop(reg_cont_obj);
1497 
1498     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1499 
1500     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1501     __ testptr(reg_is_cont, reg_is_cont);
1502     __ jcc(Assembler::notZero, L_thaw);
1503 
1504     // --- Resolve path
1505 
1506     // Make sure the call is patchable
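         // (align so that the 4-byte displacement of the call starts on a word boundary
         //  and can later be patched atomically when the call is resolved)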
1507     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1508     // Emit stub for static call
1509     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1510     if (stub == nullptr) {
1511       fatal("CodeCache is full at gen_continuation_enter");
1512     }
1513     __ call(resolve);
1514     oop_maps->add_gc_map(__ pc() - start, map);
1515     __ post_call_nop();
1516 
1517     __ jmp(L_exit);
1518   }
1519 
1520   // compiled entry
1521   __ align(CodeEntryAlignment);
1522   compiled_entry_offset = __ pc() - start;
1523   __ enter();
1524 
1525   stack_slots = 2; // will be adjusted in setup
1526   OopMap* map = continuation_enter_setup(masm, stack_slots);
1527 
1528   // Frame is now completed as far as size and linkage.
1529   frame_complete = __ pc() - start;
1530 
1531   __ verify_oop(reg_cont_obj);
1532 
1533   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1534 
1535   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1536   __ testptr(reg_is_cont, reg_is_cont);
1537   __ jccb(Assembler::notZero, L_thaw);
1538 
1539   // --- call Continuation.enter(Continuation c, boolean isContinue)
1540 
1541   // Make sure the call is patchable
1542   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1543 
1544   // Emit stub for static call
1545   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1546   if (stub == nullptr) {
1547     fatal("CodeCache is full at gen_continuation_enter");
1548   }
1549 
1550   // The call needs to be resolved. There's a special case for this in
1551   // SharedRuntime::find_callee_info_helper() which calls
1552   // LinkResolver::resolve_continuation_enter() which resolves the call to
1553   // Continuation.enter(Continuation c, boolean isContinue).
1554   __ call(resolve);
1555 
1556   oop_maps->add_gc_map(__ pc() - start, map);
1557   __ post_call_nop();
1558 
1559   __ jmpb(L_exit);
1560 
1561   // --- Thawing path
1562 
1563   __ bind(L_thaw);
1564 
1565   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1566   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1567 
1568   ContinuationEntry::_return_pc_offset = __ pc() - start;
1569   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1570   __ post_call_nop();
1571 
1572   // --- Normal exit (resolve/thawing)
1573 
1574   __ bind(L_exit);
1575   ContinuationEntry::_cleanup_offset = __ pc() - start;
1576   continuation_enter_cleanup(masm);
1577   __ pop(rbp);
1578   __ ret(0);
1579 
1580   // --- Exception handling path
1581 
1582   exception_offset = __ pc() - start;
1583 
1584   continuation_enter_cleanup(masm);
1585   __ pop(rbp);
1586 
1587   __ movptr(c_rarg0, r15_thread);
1588   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1589 
1590   // rax still holds the original exception oop, save it before the call
1591   __ push(rax);
1592 
1593   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1594   __ movptr(rbx, rax);
1595 
1596   // Continue at exception handler:
1597   //   rax: exception oop
1598   //   rbx: exception handler
1599   //   rdx: exception pc
1600   __ pop(rax);
1601   __ verify_oop(rax);
1602   __ pop(rdx);
1603   __ jmp(rbx);
1604 }
1605 
1606 static void gen_continuation_yield(MacroAssembler* masm,
1607                                    const VMRegPair* regs,
1608                                    OopMapSet* oop_maps,
1609                                    int& frame_complete,
1610                                    int& stack_slots,
1611                                    int& compiled_entry_offset) {
1612   enum layout {
1613     rbp_off,
1614     rbpH_off,
1615     return_off,
1616     return_off2,
1617     framesize // inclusive of return address
1618   };
1619   stack_slots = framesize /  VMRegImpl::slots_per_word;
1620   assert(stack_slots == 2, "recheck layout");
1621 
1622   address start = __ pc();
1623   compiled_entry_offset = __ pc() - start;
1624   __ enter();
1625   address the_pc = __ pc();
1626 
1627   frame_complete = the_pc - start;
1628 
1629   // This nop must be exactly at the PC we push into the frame info.
1630   // We use this nop for fast CodeBlob lookup, associate the OopMap
1631   // with it right away.
1632   __ post_call_nop();
1633   OopMap* map = new OopMap(framesize, 1);
1634   oop_maps->add_gc_map(frame_complete, map);
1635 
1636   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1637   __ movptr(c_rarg0, r15_thread);
1638   __ movptr(c_rarg1, rsp);
1639   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1640   __ reset_last_Java_frame(true);
1641 
1642   Label L_pinned;
1643 
1644   __ testptr(rax, rax);
1645   __ jcc(Assembler::notZero, L_pinned);
1646 
1647   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1648   continuation_enter_cleanup(masm);
1649   __ pop(rbp);
1650   __ ret(0);
1651 
1652   __ bind(L_pinned);
1653 
1654   // Pinned, return to caller
1655 
1656   // handle pending exception thrown by freeze
1657   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1658   Label ok;
1659   __ jcc(Assembler::equal, ok);
1660   __ leave();
1661   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1662   __ bind(ok);
1663 
1664   __ leave();
1665   __ ret(0);
1666 }
1667 
1668 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1669   ::continuation_enter_cleanup(masm);
1670 }
1671 
1672 static void gen_special_dispatch(MacroAssembler* masm,
1673                                  const methodHandle& method,
1674                                  const BasicType* sig_bt,
1675                                  const VMRegPair* regs) {
1676   verify_oop_args(masm, method, sig_bt, regs);
1677   vmIntrinsics::ID iid = method->intrinsic_id();
1678 
1679   // Now write the args into the outgoing interpreter space
1680   bool     has_receiver   = false;
1681   Register receiver_reg   = noreg;
1682   int      member_arg_pos = -1;
1683   Register member_reg     = noreg;
1684   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1685   if (ref_kind != 0) {
1686     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1687     member_reg = rbx;  // known to be free at this point
1688     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1689   } else if (iid == vmIntrinsics::_invokeBasic) {
1690     has_receiver = true;
1691   } else if (iid == vmIntrinsics::_linkToNative) {
1692     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1693     member_reg = rbx;  // known to be free at this point
1694   } else {
1695     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1696   }
1697 
1698   if (member_reg != noreg) {
1699     // Load the member_arg into register, if necessary.
1700     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1701     VMReg r = regs[member_arg_pos].first();
1702     if (r->is_stack()) {
1703       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1704     } else {
1705       // no data motion is needed
1706       member_reg = r->as_Register();
1707     }
1708   }
1709 
1710   if (has_receiver) {
1711     // Make sure the receiver is loaded into a register.
1712     assert(method->size_of_parameters() > 0, "oob");
1713     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1714     VMReg r = regs[0].first();
1715     assert(r->is_valid(), "bad receiver arg");
1716     if (r->is_stack()) {
1717       // Porting note:  This assumes that compiled calling conventions always
1718       // pass the receiver oop in a register.  If this is not true on some
1719       // platform, pick a temp and load the receiver from stack.
1720       fatal("receiver always in a register");
1721       receiver_reg = j_rarg0;  // known to be free at this point
1722       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1723     } else {
1724       // no data motion is needed
1725       receiver_reg = r->as_Register();
1726     }
1727   }
1728 
1729   // Figure out which address we are really jumping to:
1730   MethodHandles::generate_method_handle_dispatch(masm, iid,
1731                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1732 }
1733 
1734 // ---------------------------------------------------------------------------
1735 // Generate a native wrapper for a given method.  The method takes arguments
1736 // in the Java compiled code convention, marshals them to the native
1737 // convention (handlizes oops, etc), transitions to native, makes the call,
1738 // returns to java state (possibly blocking), unhandlizes any result and
1739 // returns.
1740 //
1741 // Critical native functions are a shorthand for the use of
1742 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1743 // functions.  The wrapper is expected to unpack the arguments before
1744 // passing them to the callee. Critical native functions leave the state _in_Java,
1745 // since they cannot stop for GC.
1746 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1747 // block and the check for pending exceptions, since it's impossible for them
1748 // to be thrown.
1749 //
1750 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1751                                                 const methodHandle& method,
1752                                                 int compile_id,
1753                                                 BasicType* in_sig_bt,
1754                                                 VMRegPair* in_regs,
1755                                                 BasicType ret_type) {
1756   if (method->is_continuation_native_intrinsic()) {
1757     int exception_offset = -1;
1758     OopMapSet* oop_maps = new OopMapSet();
1759     int frame_complete = -1;
1760     int stack_slots = -1;
1761     int interpreted_entry_offset = -1;
1762     int vep_offset = -1;
1763     if (method->is_continuation_enter_intrinsic()) {
1764       gen_continuation_enter(masm,
1765                              in_regs,
1766                              exception_offset,
1767                              oop_maps,
1768                              frame_complete,
1769                              stack_slots,
1770                              interpreted_entry_offset,
1771                              vep_offset);
1772     } else if (method->is_continuation_yield_intrinsic()) {
1773       gen_continuation_yield(masm,
1774                              in_regs,
1775                              oop_maps,
1776                              frame_complete,
1777                              stack_slots,
1778                              vep_offset);
1779     } else {
1780       guarantee(false, "Unknown Continuation native intrinsic");
1781     }
1782 
1783 #ifdef ASSERT
1784     if (method->is_continuation_enter_intrinsic()) {
1785       assert(interpreted_entry_offset != -1, "Must be set");
1786       assert(exception_offset != -1,         "Must be set");
1787     } else {
1788       assert(interpreted_entry_offset == -1, "Must be unset");
1789       assert(exception_offset == -1,         "Must be unset");
1790     }
1791     assert(frame_complete != -1,    "Must be set");
1792     assert(stack_slots != -1,       "Must be set");
1793     assert(vep_offset != -1,        "Must be set");
1794 #endif
1795 
1796     __ flush();
1797     nmethod* nm = nmethod::new_native_nmethod(method,
1798                                               compile_id,
1799                                               masm->code(),
1800                                               vep_offset,
1801                                               frame_complete,
1802                                               stack_slots,
1803                                               in_ByteSize(-1),
1804                                               in_ByteSize(-1),
1805                                               oop_maps,
1806                                               exception_offset);
1807     if (nm == nullptr) return nm;
1808     if (method->is_continuation_enter_intrinsic()) {
1809       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1810     } else if (method->is_continuation_yield_intrinsic()) {
1811       ContinuationEntry::set_yield_code(nm);
1812     }
1813     return nm;
1814   }
1815 
1816   if (method->is_method_handle_intrinsic()) {
1817     vmIntrinsics::ID iid = method->intrinsic_id();
1818     intptr_t start = (intptr_t)__ pc();
1819     int vep_offset = ((intptr_t)__ pc()) - start;
1820     gen_special_dispatch(masm,
1821                          method,
1822                          in_sig_bt,
1823                          in_regs);
1824     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1825     __ flush();
1826     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1827     return nmethod::new_native_nmethod(method,
1828                                        compile_id,
1829                                        masm->code(),
1830                                        vep_offset,
1831                                        frame_complete,
1832                                        stack_slots / VMRegImpl::slots_per_word,
1833                                        in_ByteSize(-1),
1834                                        in_ByteSize(-1),
1835                                        nullptr);
1836   }
1837   address native_func = method->native_function();
1838   assert(native_func != nullptr, "must have function");
1839 
1840   // An OopMap for lock (and class if static)
1841   OopMapSet *oop_maps = new OopMapSet();
1842   intptr_t start = (intptr_t)__ pc();
1843 
1844   // We have received a description of where all the Java args are located
1845   // on entry to the wrapper. We need to convert these args to where
1846   // the jni function will expect them. To figure out where they go
1847   // we convert the java signature to a C signature by inserting
1848   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1849 
1850   const int total_in_args = method->size_of_parameters();
1851   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1852 
1853   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1854   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1855 
1856   int argc = 0;
1857   out_sig_bt[argc++] = T_ADDRESS;
1858   if (method->is_static()) {
1859     out_sig_bt[argc++] = T_OBJECT;
1860   }
1861 
1862   for (int i = 0; i < total_in_args ; i++ ) {
1863     out_sig_bt[argc++] = in_sig_bt[i];
1864   }
1865 
1866   // Now figure out where the args must be stored and how much stack space
1867   // they require.
1868   int out_arg_slots;
1869   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1870 
1871   // Compute framesize for the wrapper.  We need to handlize all oops in
1872   // incoming registers
1873 
1874   // Calculate the total number of stack slots we will need.
1875 
1876   // First count the abi requirement plus all of the outgoing args
1877   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1878 
1879   // Now the space for the inbound oop handle area
1880   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
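       // (room to handlize up to the 6 oop arguments the Java convention can pass in registers)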
1881 
1882   int oop_handle_offset = stack_slots;
1883   stack_slots += total_save_slots;
1884 
1885   // Now any space we need for handlizing a klass if static method
1886 
1887   int klass_slot_offset = 0;
1888   int klass_offset = -1;
1889   int lock_slot_offset = 0;
1890   bool is_static = false;
1891 
1892   if (method->is_static()) {
1893     klass_slot_offset = stack_slots;
1894     stack_slots += VMRegImpl::slots_per_word;
1895     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1896     is_static = true;
1897   }
1898 
1899   // Plus a lock if needed
1900 
1901   if (method->is_synchronized()) {
1902     lock_slot_offset = stack_slots;
1903     stack_slots += VMRegImpl::slots_per_word;
1904   }
1905 
1906   // Now a place (+2) to save return values or temp during shuffling
1907   // + 4 for return address (which we own) and saved rbp
1908   stack_slots += 6;
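       // (VMReg slots are 32-bit: the +2 is the single result/temp word just below rbp,
       //  the +4 covers the return address and the saved rbp)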
1909 
1910   // Ok The space we have allocated will look like:
1911   //
1912   //
1913   // FP-> |                     |
1914   //      |---------------------|
1915   //      | 2 slots for moves   |
1916   //      |---------------------|
1917   //      | lock box (if sync)  |
1918   //      |---------------------| <- lock_slot_offset
1919   //      | klass (if static)   |
1920   //      |---------------------| <- klass_slot_offset
1921   //      | oopHandle area      |
1922   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1923   //      | outbound memory     |
1924   //      | based arguments     |
1925   //      |                     |
1926   //      |---------------------|
1927   //      |                     |
1928   // SP-> | out_preserved_slots |
1929   //
1930   //
1931 
1932 
1933   // Now compute actual number of stack words we need rounding to make
1934   // stack properly aligned.
1935   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1936 
1937   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1938 
1939   // First thing make an ic check to see if we should even be here
1940 
1941   // We are free to use all registers as temps without saving them and
1942   // restoring them except rbp. rbp is the only callee save register
1943   // as far as the interpreter and the compiler(s) are concerned.
1944 
1945   const Register receiver = j_rarg0;
1946 
1947   Label exception_pending;
1948 
1949   assert_different_registers(receiver, rscratch1, rscratch2);
1950   __ verify_oop(receiver);
1951   __ ic_check(8 /* end_alignment */);
1952 
1953   int vep_offset = ((intptr_t)__ pc()) - start;
1954 
1955   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1956     Label L_skip_barrier;
1957     Register klass = r10;
1958     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1959     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1960 
1961     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1962 
1963     __ bind(L_skip_barrier);
1964   }
1965 
1966 #ifdef COMPILER1
1967   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1968   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1969     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1970   }
1971 #endif // COMPILER1
1972 
1973   // The instruction at the verified entry point must be 5 bytes or longer
1974   // because it can be patched on the fly by make_non_entrant. The stack bang
1975   // instruction fits that requirement.
1976 
1977   // Generate stack overflow check
1978   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1979 
1980   // Generate a new frame for the wrapper.
1981   __ enter();
1982   // -2 because return address is already present and so is saved rbp
1983   __ subptr(rsp, stack_size - 2*wordSize);
1984 
1985   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1986   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1987   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1988 
1989   // Frame is now completed as far as size and linkage.
1990   int frame_complete = ((intptr_t)__ pc()) - start;
1991 
1992 #ifdef ASSERT
1993   __ check_stack_alignment(rsp, "improperly aligned stack");
1994 #endif /* ASSERT */
1995 
1996 
1997   // We use r14 as the oop handle for the receiver/klass
1998   // It is callee save so it survives the call to native
1999 
2000   const Register oop_handle_reg = r14;
2001 
2002   //
2003   // We immediately shuffle the arguments so that any vm call we have to
2004   // make from here on out (sync slow path, jvmti, etc.) we will have
2005   // captured the oops from our caller and have a valid oopMap for
2006   // them.
2007 
2008   // -----------------
2009   // The Grand Shuffle
2010 
2011   // The Java calling convention is either equal (linux) or denser (win64) than the
2012   // C calling convention. However, because of the jni_env argument the C calling
2013   // convention always has at least one more argument (two for a static method) than Java.
2014   // Therefore if we move the args from java -> c backwards then we will never have
2015   // a register->register conflict and we don't have to build a dependency graph
2016   // and figure out how to break any cycles.
2017   //
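       // As a concrete sketch (Linux x86_64): the Java integer args arrive in
       // rsi, rdx, rcx, r8, r9, rdi while the C args, with JNIEnv* claiming rdi,
       // start at rsi. Moving the last Java arg first therefore guarantees that a
       // C destination is never a register still holding an unread Java source.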
2018 
2019   // Record esp-based slot for receiver on stack for non-static methods
2020   int receiver_offset = -1;
2021 
2022   // This is a trick. We double the stack slots so we can claim
2023   // the oops in the caller's frame. Since we are sure to have
2024   // more args than the caller, doubling is enough to make
2025   // sure we can capture all the incoming oop args from the
2026   // caller.
2027   //
2028   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2029 
2030   // Mark location of rbp (someday)
2031   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2032 
2033   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2034   // All inbound args are referenced based on rbp and all outbound args via rsp.
2035 
2036 
2037 #ifdef ASSERT
2038   bool reg_destroyed[Register::number_of_registers];
2039   bool freg_destroyed[XMMRegister::number_of_registers];
2040   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2041     reg_destroyed[r] = false;
2042   }
2043   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2044     freg_destroyed[f] = false;
2045   }
2046 
2047 #endif /* ASSERT */
2048 
2049   // For JNI natives the incoming and outgoing registers are offset upwards.
2050   GrowableArray<int> arg_order(2 * total_in_args);
2051 
2052   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2053     arg_order.push(i);
2054     arg_order.push(c_arg);
2055   }
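       // arg_order now holds (java_index, c_index) pairs starting from the last argument,
       // so the moves below run back-to-front as described above.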
2056 
2057   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2058     int i = arg_order.at(ai);
2059     int c_arg = arg_order.at(ai + 1);
2060     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2061 #ifdef ASSERT
2062     if (in_regs[i].first()->is_Register()) {
2063       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2064     } else if (in_regs[i].first()->is_XMMRegister()) {
2065       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2066     }
2067     if (out_regs[c_arg].first()->is_Register()) {
2068       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2069     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2070       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2071     }
2072 #endif /* ASSERT */
2073     switch (in_sig_bt[i]) {
2074       case T_ARRAY:
2075       case T_OBJECT:
2076         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2077                     ((i == 0) && (!is_static)),
2078                     &receiver_offset);
2079         break;
2080       case T_VOID:
2081         break;
2082 
2083       case T_FLOAT:
2084         __ float_move(in_regs[i], out_regs[c_arg]);
2085           break;
2086 
2087       case T_DOUBLE:
2088         assert( i + 1 < total_in_args &&
2089                 in_sig_bt[i + 1] == T_VOID &&
2090                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2091         __ double_move(in_regs[i], out_regs[c_arg]);
2092         break;
2093 
2094       case T_LONG :
2095         __ long_move(in_regs[i], out_regs[c_arg]);
2096         break;
2097 
2098       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2099 
2100       default:
2101         __ move32_64(in_regs[i], out_regs[c_arg]);
2102     }
2103   }
2104 
2105   int c_arg;
2106 
2107   // Pre-load a static method's oop into r14.  Used both by locking code and
2108   // the normal JNI call code.
2109   // point c_arg at the first arg that is already loaded in case we
2110   // need to spill before we call out
2111   c_arg = total_c_args - total_in_args;
2112 
2113   if (method->is_static()) {
2114 
2115     //  load oop into a register
2116     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2117 
2118     // Now handlize the static class mirror; it's known to be non-null.
2119     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2120     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2121 
2122     // Now get the handle
2123     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2124     // store the klass handle as second argument
2125     __ movptr(c_rarg1, oop_handle_reg);
2126     // and protect the arg if we must spill
2127     c_arg--;
2128   }
2129 
2130   // Change state to native (we save the return address in the thread, since it might not
2131   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2132   // points into the right code segment. It does not have to be the correct return pc.
2133   // We use the same pc/oopMap repeatedly when we call out
2134 
2135   Label native_return;
2136   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2137     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2138     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2139   } else {
2140     intptr_t the_pc = (intptr_t) __ pc();
2141     oop_maps->add_gc_map(the_pc - start, map);
2142 
2143     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2144   }
2145 
2146   // We have all of the arguments set up at this point. We must not touch any
2147   // argument registers from here on (saving/restoring them is fine though: they no longer hold raw oops, those were handlized above).
2148 
2149   if (DTraceMethodProbes) {
2150     // protect the args we've loaded
2151     save_args(masm, total_c_args, c_arg, out_regs);
2152     __ mov_metadata(c_rarg1, method());
2153     __ call_VM_leaf(
2154       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2155       r15_thread, c_rarg1);
2156     restore_args(masm, total_c_args, c_arg, out_regs);
2157   }
2158 
2159   // RedefineClasses() tracing support for obsolete method entry
2160   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2161     // protect the args we've loaded
2162     save_args(masm, total_c_args, c_arg, out_regs);
2163     __ mov_metadata(c_rarg1, method());
2164     __ call_VM_leaf(
2165       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2166       r15_thread, c_rarg1);
2167     restore_args(masm, total_c_args, c_arg, out_regs);
2168   }
2169 
2170   // Lock a synchronized method
2171 
2172   // Register definitions used by locking and unlocking
2173 
2174   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2175   const Register obj_reg  = rbx;  // Will contain the oop
2176   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2177   const Register old_hdr  = r13;  // value of old header at unlock time
2178 
2179   Label slow_path_lock;
2180   Label lock_done;
2181 
2182   if (method->is_synchronized()) {
2183     Label count_mon;
2184 
2185     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2186 
2187     // Get the handle (the 2nd argument)
2188     __ mov(oop_handle_reg, c_rarg1);
2189 
2190     // Get address of the box
2191 
2192     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2193 
2194     // Load the oop from the handle
2195     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2196 
2197     if (LockingMode == LM_MONITOR) {
2198       __ jmp(slow_path_lock);
2199     } else if (LockingMode == LM_LEGACY) {
2200       // Load immediate 1 into swap_reg %rax
2201       __ movl(swap_reg, 1);
2202 
2203       // Load (object->mark() | 1) into swap_reg %rax
2204       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2205 
2206       // Save (object->mark() | 1) into BasicLock's displaced header
2207       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2208 
2209       // src -> dest iff dest == rax else rax <- dest
2210       __ lock();
2211       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2212       __ jcc(Assembler::equal, count_mon);
2213 
2214       // Hmm should this move to the slow path code area???
2215 
2216       // Test if the oopMark is an obvious stack pointer, i.e.,
2217       //  1) (mark & 3) == 0, and
2218       //  2) rsp <= mark < rsp + os::pagesize()
2219       // These 3 tests can be done by evaluating the following
2220       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2221       // assuming both the stack pointer and pagesize have their
2222       // least significant 2 bits clear.
2223       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
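           // e.g. with a 4K page, 3 - os::vm_page_size() has only bits 1:0 and 63:12 set,
           // so the masked result is zero exactly when 0 <= mark - rsp < 4096 and the low
           // two bits of the difference (and hence of the mark) are clear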
2224 
2225       __ subptr(swap_reg, rsp);
2226       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2227 
2228       // Save the test result, for recursive case, the result is zero
2229       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2230       __ jcc(Assembler::notEqual, slow_path_lock);
2231 
2232       __ bind(count_mon);
2233       __ inc_held_monitor_count();
2234     } else {
2235       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2236       __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2237     }
2238 
2239     // Slow path will re-enter here
2240     __ bind(lock_done);
2241   }
2242 
2243   // Finally just about ready to make the JNI call
2244 
2245   // get JNIEnv* which is first argument to native
2246   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2247 
2248   // Now set thread in native
2249   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2250 
2251   __ call(RuntimeAddress(native_func));
2252 
2253   // Verify or restore cpu control state after JNI call
2254   __ restore_cpu_control_state_after_jni(rscratch1);
2255 
2256   // Unpack native results.
2257   switch (ret_type) {
2258   case T_BOOLEAN: __ c2bool(rax);            break;
2259   case T_CHAR   : __ movzwl(rax, rax);      break;
2260   case T_BYTE   : __ sign_extend_byte (rax); break;
2261   case T_SHORT  : __ sign_extend_short(rax); break;
2262   case T_INT    : /* nothing to do */        break;
2263   case T_DOUBLE :
2264   case T_FLOAT  :
2265     // Result is in xmm0 we'll save as needed
2266     break;
2267   case T_ARRAY:                 // Really a handle
2268   case T_OBJECT:                // Really a handle
2269       break; // can't de-handlize until after safepoint check
2270   case T_VOID: break;
2271   case T_LONG: break;
2272   default       : ShouldNotReachHere();
2273   }
2274 
2275   // Switch thread to "native transition" state before reading the synchronization state.
2276   // This additional state is necessary because reading and testing the synchronization
2277   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2278   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2279   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2280   //     Thread A is resumed to finish this native method, but doesn't block here since it
2281   //     didn't see any synchronization in progress, and escapes.
2282   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2283 
2284   // Force this write out before the read below
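       // (when UseSystemMemoryBarrier is enabled the VM issues a system-wide barrier
       //  while synchronizing a safepoint, so the per-thread fence can be elided here)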
2285   if (!UseSystemMemoryBarrier) {
2286     __ membar(Assembler::Membar_mask_bits(
2287               Assembler::LoadLoad | Assembler::LoadStore |
2288               Assembler::StoreLoad | Assembler::StoreStore));
2289   }
2290 
2291   // check for safepoint operation in progress and/or pending suspend requests
2292   {
2293     Label Continue;
2294     Label slow_path;
2295 
2296     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2297 
2298     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2299     __ jcc(Assembler::equal, Continue);
2300     __ bind(slow_path);
2301 
2302     // Don't use call_VM as it will see a possible pending exception and forward it
2303     // and never return here preventing us from clearing _last_native_pc down below.
2304     // Nor can we use call_VM_leaf, as it will check that rsi & rdi are
2305     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2306     // by hand.
2307     //
2308     __ vzeroupper();
2309     save_native_result(masm, ret_type, stack_slots);
2310     __ mov(c_rarg0, r15_thread);
2311     __ mov(r12, rsp); // remember sp
2312     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2313     __ andptr(rsp, -16); // align stack as required by ABI
2314     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2315     __ mov(rsp, r12); // restore sp
2316     __ reinit_heapbase();
2317     // Restore any method result value
2318     restore_native_result(masm, ret_type, stack_slots);
2319     __ bind(Continue);
2320   }
2321 
2322   // change thread state
2323   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2324 
2325   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2326     // Check preemption for Object.wait()
2327     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2328     __ cmpptr(rscratch1, NULL_WORD);
2329     __ jccb(Assembler::equal, native_return);
2330     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2331     __ jmp(rscratch1);
2332     __ bind(native_return);
2333 
2334     intptr_t the_pc = (intptr_t) __ pc();
2335     oop_maps->add_gc_map(the_pc - start, map);
2336   }
2337 
2338 
2339   Label reguard;
2340   Label reguard_done;
2341   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2342   __ jcc(Assembler::equal, reguard);
2343   __ bind(reguard_done);
2344 
2345   // native result if any is live
2346 
2347   // Unlock
2348   Label slow_path_unlock;
2349   Label unlock_done;
2350   if (method->is_synchronized()) {
2351 
2352     Label fast_done;
2353 
2354     // Get locked oop from the handle we passed to jni
2355     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2356 
2357     if (LockingMode == LM_LEGACY) {
2358       Label not_recur;
2359       // Simple recursive lock?
2360       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2361       __ jcc(Assembler::notEqual, not_recur);
2362       __ dec_held_monitor_count();
2363       __ jmpb(fast_done);
2364       __ bind(not_recur);
2365     }
2366 
2367     // Must save rax if it is live now because cmpxchg must use it
2368     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2369       save_native_result(masm, ret_type, stack_slots);
2370     }
2371 
2372     if (LockingMode == LM_MONITOR) {
2373       __ jmp(slow_path_unlock);
2374     } else if (LockingMode == LM_LEGACY) {
2375       // get address of the stack lock
2376       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2377       //  get old displaced header
2378       __ movptr(old_hdr, Address(rax, 0));
2379 
2380       // Atomic swap old header if oop still contains the stack lock
2381       __ lock();
2382       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2383       __ jcc(Assembler::notEqual, slow_path_unlock);
2384       __ dec_held_monitor_count();
2385     } else {
2386       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2387       __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2388     }
2389 
2390     // slow path re-enters here
2391     __ bind(unlock_done);
2392     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2393       restore_native_result(masm, ret_type, stack_slots);
2394     }
2395 
2396     __ bind(fast_done);
2397   }
2398   if (DTraceMethodProbes) {
2399     save_native_result(masm, ret_type, stack_slots);
2400     __ mov_metadata(c_rarg1, method());
2401     __ call_VM_leaf(
2402          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2403          r15_thread, c_rarg1);
2404     restore_native_result(masm, ret_type, stack_slots);
2405   }
2406 
2407   __ reset_last_Java_frame(false);
2408 
2409   // Unbox oop result, e.g. JNIHandles::resolve value.
2410   if (is_reference_type(ret_type)) {
2411     __ resolve_jobject(rax /* value */,
2412                        rcx /* tmp */);
2413   }
2414 
2415   if (CheckJNICalls) {
2416     // clear_pending_jni_exception_check
2417     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2418   }
2419 
2420   // reset handle block
2421   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2422   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2423 
2424   // pop our frame
2425 
2426   __ leave();
2427 
2428 #if INCLUDE_JFR
2429   // We need to do a poll test after unwind in case the sampler
2430   // managed to sample the native frame after returning to Java.
2431   Label L_return;
2432   address poll_test_pc = __ pc();
2433   __ relocate(relocInfo::poll_return_type);
2434   __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2435   __ jccb(Assembler::zero, L_return);
2436   __ lea(rscratch1, InternalAddress(poll_test_pc));
2437   __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2438   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2439     "polling page return stub not created yet");
2440   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2441   __ jump(RuntimeAddress(stub));
2442   __ bind(L_return);
2443 #endif // INCLUDE_JFR
2444 
2445   // Any exception pending?
2446   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2447   __ jcc(Assembler::notEqual, exception_pending);
2448 
2449   // Return
2450 
2451   __ ret(0);
2452 
2453   // Unexpected paths are out of line and go here
2454 
2455   // forward the exception
2456   __ bind(exception_pending);
2457 
2458   // and forward the exception
2459   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2460 
2461   // Slow path locking & unlocking
2462   if (method->is_synchronized()) {
2463 
2464     // BEGIN Slow path lock
2465     __ bind(slow_path_lock);
2466 
2467     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2468     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2469 
2470     // protect the args we've loaded
2471     save_args(masm, total_c_args, c_arg, out_regs);
2472 
2473     __ mov(c_rarg0, obj_reg);
2474     __ mov(c_rarg1, lock_reg);
2475     __ mov(c_rarg2, r15_thread);
2476 
2477     // Not a leaf but we have last_Java_frame setup as we want.
2478     // We don't want to unmount in case of contention since that would complicate preserving
2479     // the arguments that had already been marshalled into the native convention. So we force
2480     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2481     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2482     __ push_cont_fastpath();
2483     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2484     __ pop_cont_fastpath();
2485     restore_args(masm, total_c_args, c_arg, out_regs);
2486 
2487 #ifdef ASSERT
2488     { Label L;
2489     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2490     __ jcc(Assembler::equal, L);
2491     __ stop("no pending exception allowed on exit from monitorenter");
2492     __ bind(L);
2493     }
2494 #endif
2495     __ jmp(lock_done);
2496 
2497     // END Slow path lock
2498 
2499     // BEGIN Slow path unlock
2500     __ bind(slow_path_unlock);
2501 
2502     // If we haven't already saved the native result we must save it now as xmm registers
2503     // are still exposed.
2504     __ vzeroupper();
2505     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2506       save_native_result(masm, ret_type, stack_slots);
2507     }
2508 
2509     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2510 
2511     __ mov(c_rarg0, obj_reg);
2512     __ mov(c_rarg2, r15_thread);
2513     __ mov(r12, rsp); // remember sp
2514     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2515     __ andptr(rsp, -16); // align stack as required by ABI
2516 
2517     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2518     // NOTE that obj_reg == rbx currently
2519     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2520     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2521 
2522     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2523     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2524     __ mov(rsp, r12); // restore sp
2525     __ reinit_heapbase();
2526 #ifdef ASSERT
2527     {
2528       Label L;
2529       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2530       __ jcc(Assembler::equal, L);
2531       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2532       __ bind(L);
2533     }
2534 #endif /* ASSERT */
2535 
2536     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2537 
2538     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2539       restore_native_result(masm, ret_type, stack_slots);
2540     }
2541     __ jmp(unlock_done);
2542 
2543     // END Slow path unlock
2544 
2545   } // synchronized
2546 
2547   // SLOW PATH Reguard the stack if needed
2548 
2549   __ bind(reguard);
2550   __ vzeroupper();
2551   save_native_result(masm, ret_type, stack_slots);
2552   __ mov(r12, rsp); // remember sp
2553   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2554   __ andptr(rsp, -16); // align stack as required by ABI
2555   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2556   __ mov(rsp, r12); // restore sp
2557   __ reinit_heapbase();
2558   restore_native_result(masm, ret_type, stack_slots);
2559   // and continue
2560   __ jmp(reguard_done);
2561 
2562 
2563 
2564   __ flush();
2565 
2566   nmethod *nm = nmethod::new_native_nmethod(method,
2567                                             compile_id,
2568                                             masm->code(),
2569                                             vep_offset,
2570                                             frame_complete,
2571                                             stack_slots / VMRegImpl::slots_per_word,
2572                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2573                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2574                                             oop_maps);
2575 
2576   return nm;
2577 }
2578 
2579 // This function returns the adjustment (in number of words) to a c2i adapter
2580 // activation for use during deoptimization.
2581 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2582   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2583 }
2584 
2585 
2586 uint SharedRuntime::out_preserve_stack_slots() {
2587   return 0;
2588 }
2589 
2590 
2591 // Number of stack slots between incoming argument block and the start of
2592 // a new frame.  The PROLOG must add this many slots to the stack.  The
2593 // EPILOG must remove this many slots.  amd64 needs two slots for
2594 // return address.
2595 uint SharedRuntime::in_preserve_stack_slots() {
2596   return 4 + 2 * VerifyStackAtCalls;
2597 }
2598 
2599 VMReg SharedRuntime::thread_register() {
2600   return r15_thread->as_VMReg();
2601 }
2602 
2603 //------------------------------generate_deopt_blob----------------------------
2604 void SharedRuntime::generate_deopt_blob() {
2605   // Allocate space for the code
2606   ResourceMark rm;
2607   // Setup code generation tools
2608   int pad = 0;
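       // Wider register files (AVX-512 ZMM/opmask, APX extended GPRs) make the
       // RegisterSaver save/restore sequences longer, so give the buffer extra room.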
2609   if (UseAVX > 2) {
2610     pad += 1024;
2611   }
2612   if (UseAPX) {
2613     pad += 1024;
2614   }
2615 #if INCLUDE_JVMCI
2616   if (EnableJVMCI) {
2617     pad += 512; // Increase the buffer size when compiling for JVMCI
2618   }
2619 #endif
2620   const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id);
2621   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)SharedStubId::deopt_id, name);
2622   if (blob != nullptr) {
2623     _deopt_blob = blob->as_deoptimization_blob();
2624     return;
2625   }
2626 
2627   CodeBuffer buffer(name, 2560+pad, 1024);
2628   MacroAssembler* masm = new MacroAssembler(&buffer);
2629   int frame_size_in_words;
2630   OopMap* map = nullptr;
2631   OopMapSet *oop_maps = new OopMapSet();
2632 
2633   // -------------
2634   // This code enters when returning to a de-optimized nmethod.  A return
2635   // address has been pushed on the stack, and return values are in
2636   // registers.
2637   // If we are doing a normal deopt then we were called from the patched
2638   // nmethod from the point we returned to the nmethod. So the return
2639   // address on the stack is wrong by NativeCall::instruction_size.
2640   // We will adjust the value so it looks like we have the original return
2641   // address on the stack (like when we eagerly deoptimized).
2642   // In the case of an exception pending when deoptimizing, we enter
2643   // with a return address on the stack that points after the call we patched
2644   // into the exception handler. We have the following register state from,
2645   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2646   //    rax: exception oop
2647   //    rbx: exception handler
2648   //    rdx: throwing pc
2649   // So in this case we simply jam rdx into the useless return address and
2650   // the stack looks just like we want.
2651   //
2652   // At this point we need to de-opt.  We save the argument return
2653   // registers.  We call the first C routine, fetch_unroll_info().  This
2654   // routine captures the return values and returns a structure which
2655   // describes the current frame size and the sizes of all replacement frames.
2656   // The current frame is compiled code and may contain many inlined
2657   // functions, each with their own JVM state.  We pop the current frame, then
2658   // push all the new frames.  Then we call the C routine unpack_frames() to
2659   // populate these frames.  Finally unpack_frames() returns us the new target
2660   // address.  Notice that callee-save registers are BLOWN here; they have
2661   // already been captured in the vframeArray at the time the return PC was
2662   // patched.
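  // For orientation (summary of the code below): the blob exposes several
  // entry points, measured as offsets from 'start' -- the normal deopt entry
  // at offset 0, the reexecute entry (reexecute_offset), the exception
  // entries (exception_offset and exception_in_tls_offset), and, when JVMCI
  // is enabled, the uncommon trap entries (uncommon_trap_offset and
  // implicit_exception_uncommon_trap_offset).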
2663   address start = __ pc();
2664   Label cont;
2665 
2666   // Prolog for the non-exception case!
2667 
2668   // Save everything in sight.
2669   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2670 
2671   // Normal deoptimization.  Save exec mode for unpack_frames.
2672   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2673   __ jmp(cont);
2674 
2675   int reexecute_offset = __ pc() - start;
2676 #if INCLUDE_JVMCI && !defined(COMPILER1)
2677   if (UseJVMCICompiler) {
2678     // JVMCI does not use this kind of deoptimization
2679     __ should_not_reach_here();
2680   }
2681 #endif
2682 
2683   // Reexecute case
2684   // The return address is the pc that describes which bci to re-execute at.
2685 
2686   // No need to update map as each call to save_live_registers will produce an identical oopmap
2687   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2688 
2689   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2690   __ jmp(cont);
2691 
2692 #if INCLUDE_JVMCI
2693   Label after_fetch_unroll_info_call;
2694   int implicit_exception_uncommon_trap_offset = 0;
2695   int uncommon_trap_offset = 0;
2696 
2697   if (EnableJVMCI) {
2698     implicit_exception_uncommon_trap_offset = __ pc() - start;
2699 
2700     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2701     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2702 
2703     uncommon_trap_offset = __ pc() - start;
2704 
2705     // Save everything in sight.
2706     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2707     // fetch_unroll_info needs to call last_java_frame()
2708     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2709 
2710     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2711     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2712 
2713     __ movl(r14, Deoptimization::Unpack_reexecute);
2714     __ mov(c_rarg0, r15_thread);
2715     __ movl(c_rarg2, r14); // exec mode
2716     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2717     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2718 
2719     __ reset_last_Java_frame(false);
2720 
2721     __ jmp(after_fetch_unroll_info_call);
2722   } // EnableJVMCI
2723 #endif // INCLUDE_JVMCI
2724 
2725   int exception_offset = __ pc() - start;
2726 
2727   // Prolog for exception case
2728 
2729   // All registers are dead at this entry point, except for rax and
2730   // rdx, which contain the exception oop and exception pc
2731   // respectively.  Set them in TLS and fall thru to the
2732   // unpack_with_exception_in_tls entry point.
2733 
2734   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2735   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2736 
2737   int exception_in_tls_offset = __ pc() - start;
2738 
2739   // new implementation because exception oop is now passed in JavaThread
2740 
2741   // Prolog for exception case
2742   // All registers must be preserved because they might be used by LinearScan
2743   // Exception oop and throwing PC are passed in JavaThread
2744   // tos: stack at point of call to method that threw the exception (i.e. only
2745   // args are on the stack, no return address)
2746 
2747   // make room on stack for the return address
2748   // It will be patched later with the throwing pc. The correct value is not
2749   // available now because loading it from memory would destroy registers.
2750   __ push(0);
2751 
2752   // Save everything in sight.
2753   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2754 
2755   // Now it is safe to overwrite any register
2756 
2757   // Deopt during an exception.  Save exec mode for unpack_frames.
2758   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2759 
2760   // load throwing pc from JavaThread and patch it as the return address
2761   // of the current frame. Then clear the field in JavaThread
2762 
2763   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2764   __ movptr(Address(rbp, wordSize), rdx);
2765   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2766 
2767 #ifdef ASSERT
2768   // verify that there is really an exception oop in JavaThread
2769   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2770   __ verify_oop(rax);
2771 
2772   // verify that there is no pending exception
2773   Label no_pending_exception;
2774   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2775   __ testptr(rax, rax);
2776   __ jcc(Assembler::zero, no_pending_exception);
2777   __ stop("must not have pending exception here");
2778   __ bind(no_pending_exception);
2779 #endif
2780 
2781   __ bind(cont);
2782 
2783   // Call C code.  Need thread and this frame, but NOT official VM entry
2784   // crud.  We cannot block on this call, no GC can happen.
2785   //
2786   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2787 
2788   // fetch_unroll_info needs to call last_java_frame().
2789 
2790   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2791 #ifdef ASSERT
2792   { Label L;
2793     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2794     __ jcc(Assembler::equal, L);
2795     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2796     __ bind(L);
2797   }
2798 #endif // ASSERT
2799   __ mov(c_rarg0, r15_thread);
2800   __ movl(c_rarg1, r14); // exec_mode
2801   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2802 
2803   // Need to have an oopmap that tells fetch_unroll_info where to
2804   // find any register it might need.
2805   oop_maps->add_gc_map(__ pc() - start, map);
2806 
2807   __ reset_last_Java_frame(false);
2808 
2809 #if INCLUDE_JVMCI
2810   if (EnableJVMCI) {
2811     __ bind(after_fetch_unroll_info_call);
2812   }
2813 #endif
2814 
2815   // Load UnrollBlock* into rdi
2816   __ mov(rdi, rax);
2817 
2818   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2819   Label noException;
2820   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2821   __ jcc(Assembler::notEqual, noException);
2822   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2823   // QQQ: likely useless -- the exception pc was already cleared above
2824   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2825   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2826   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2827 
2828   __ verify_oop(rax);
2829 
2830   // Overwrite the result registers with the exception results.
2831   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2832   // I think this is useless
2833   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2834 
2835   __ bind(noException);
2836 
2837   // Only register save data is on the stack.
2838   // Now restore the result registers.  Everything else is either dead
2839   // or captured in the vframeArray.
2840   RegisterSaver::restore_result_registers(masm);
2841 
2842   // All of the register save area has been popped off the stack. Only the
2843   // return address remains.
2844 
2845   // Pop all the frames we must move/replace.
2846   //
2847   // Frame picture (youngest to oldest)
2848   // 1: self-frame (no frame link)
2849   // 2: deopting frame  (no frame link)
2850   // 3: caller of deopting frame (could be compiled/interpreted).
2851   //
2852   // Note: by leaving the return address of self-frame on the stack
2853   // and using the size of frame 2 to adjust the stack
2854   // when we are done the return to frame 3 will still be on the stack.
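  // Roughly, once the push loop below has run, the stack looks like this
  // (illustrative sketch, youngest frame first):
  //
  //   [ self-frame: register save area, re-pushed further below        ]
  //   [ skeletal interpreter frame for the youngest inlined JVM state  ]
  //   [ ... one skeletal interpreter frame per inlined JVM state ...   ]
  //   [ skeletal interpreter frame for the outermost deoptimized method]
  //   [ caller of the deoptimized frame, extended by caller_adjustment ]
  //
  // unpack_frames() is then called to fill in the skeletal frames.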
2855 
2856   // Pop deoptimized frame
2857   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2858   __ addptr(rsp, rcx);
2859 
2860   // rsp should be pointing at the return address to the caller (3)
2861 
2862   // Pick up the initial fp we should save
2863   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2864   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2865 
2866 #ifdef ASSERT
2867   // Compilers generate code that bangs the stack by as much as the
2868   // interpreter would need. So this stack banging should never
2869   // trigger a fault. Verify that it does not on non-product builds.
2870   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2871   __ bang_stack_size(rbx, rcx);
2872 #endif
2873 
2874   // Load address of array of frame pcs into rcx
2875   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2876 
2877   // Trash the old pc
2878   __ addptr(rsp, wordSize);
2879 
2880   // Load address of array of frame sizes into rsi
2881   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2882 
2883   // Load counter into rdx
2884   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2885 
2886   // Now adjust the caller's stack to make up for the extra locals, but
2887   // record the original sp first so that we can save it in the skeletal
2888   // interpreter frame; stack walking via interpreter_sender will then see
2889   // the unextended sp value and not the "real" sp value.
2890 
2891   const Register sender_sp = r8;
2892 
2893   __ mov(sender_sp, rsp);
2894   __ movl(rbx, Address(rdi,
2895                        Deoptimization::UnrollBlock::
2896                        caller_adjustment_offset()));
2897   __ subptr(rsp, rbx);
2898 
2899   // Push interpreter frames in a loop
2900   Label loop;
2901   __ bind(loop);
2902   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2903   __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
2904   __ pushptr(Address(rcx, 0));          // Save return address
2905   __ enter();                           // Save old & set new rbp
2906   __ subptr(rsp, rbx);                  // Prolog
2907   // This value is corrected by layout_activation_impl
2908   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2909   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2910   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2911   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2912   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2913   __ decrementl(rdx);                   // Decrement counter
2914   __ jcc(Assembler::notZero, loop);
2915   __ pushptr(Address(rcx, 0));          // Save final return address
2916 
2917   // Re-push self-frame
2918   __ enter();                           // Save old & set new rbp
2919 
2920   // Allocate a full sized register save area.
2921   // Return address and rbp are in place, so we allocate two fewer words.
2922   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2923 
2924   // Restore frame locals after moving the frame
2925   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2926   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2927 
2928   // Call C code.  Need thread but NOT official VM entry
2929   // crud.  We cannot block on this call, no GC can happen.  Call should
2930   // restore return values to their stack-slots with the new SP.
2931   //
2932   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2933 
2934   // Use rbp because the frames look interpreted now
2935   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2936   // Don't need the precise return PC here, just precise enough to point into this code blob.
2937   address the_pc = __ pc();
2938   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2939 
2940   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2941   __ mov(c_rarg0, r15_thread);
2942   __ movl(c_rarg1, r14); // second arg: exec_mode
2943   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2944   // Revert SP alignment after call since we're going to do some SP relative addressing below
2945   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2946 
2947   // Set an oopmap for the call site
2948   // Use the same PC we used for the last java frame
2949   oop_maps->add_gc_map(the_pc - start,
2950                        new OopMap( frame_size_in_words, 0 ));
2951 
2952   // Clear fp AND pc
2953   __ reset_last_Java_frame(true);
2954 
2955   // Collect return values
2956   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2957   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2958   // I think this is useless (throwing pc?)
2959   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2960 
2961   // Pop self-frame.
2962   __ leave();                           // Epilog
2963 
2964   // Jump to interpreter
2965   __ ret(0);
2966 
2967   // Make sure all code is generated
2968   masm->flush();
2969 
2970   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2971   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2972 #if INCLUDE_JVMCI
2973   if (EnableJVMCI) {
2974     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2975     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2976   }
2977 #endif
2978 
2979   AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, (uint)SharedStubId::deopt_id, name);
2980 }
2981 
2982 //------------------------------generate_handler_blob------
2983 //
2984 // Generate a special Compile2Runtime blob that saves all registers
2985 // and sets up an oopmap.
2986 //
2987 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) {
2988   assert(StubRoutines::forward_exception_entry() != nullptr,
2989          "must be generated before");
2990   assert(is_polling_page_id(id), "expected a polling page stub id");
2991 
2992   // Allocate space for the code.  Setup code generation tools.
2993   const char* name = SharedRuntime::stub_name(id);
2994   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)id, name);
2995   if (blob != nullptr) {
2996     return blob->as_safepoint_blob();
2997   }
2998 
2999   ResourceMark rm;
3000   OopMapSet *oop_maps = new OopMapSet();
3001   OopMap* map;
3002   CodeBuffer buffer(name, 2548, 1024);
3003   MacroAssembler* masm = new MacroAssembler(&buffer);
3004 
3005   address start   = __ pc();
3006   address call_pc = nullptr;
3007   int frame_size_in_words;
3008   bool cause_return = (id == SharedStubId::polling_page_return_handler_id);
3009   bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id);
3010 
3011   // Make room for return address (or push it again)
3012   if (!cause_return) {
3013     __ push(rbx);
3014   }
3015 
3016   // Save registers, fpu state, and flags
3017   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3018 
3019   // The following is basically a call_VM.  However, we need the precise
3020   // address of the call in order to generate an oopmap. Hence, we do all the
3021   // work ourselves.
3022 
3023   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3024 
3025   // The return address must always be correct so that the frame constructor
3026   // never sees an invalid pc.
3027 
3028   if (!cause_return) {
3029     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3030     // Additionally, rbx is a callee-saved register and we can look at it later to determine
3031     // if someone changed the return address for us!
3032     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3033     __ movptr(Address(rbp, wordSize), rbx);
3034   }
3035 
3036   // Do the call
3037   __ mov(c_rarg0, r15_thread);
3038   __ call(RuntimeAddress(call_ptr));
3039 
3040   // Set an oopmap for the call site.  This oopmap will map all
3041   // oop-registers and debug-info registers as callee-saved.  This
3042   // will allow deoptimization at this safepoint to find all possible
3043   // debug-info recordings, as well as let GC find all oops.
3044 
3045   oop_maps->add_gc_map( __ pc() - start, map);
3046 
3047   Label noException;
3048 
3049   __ reset_last_Java_frame(false);
3050 
3051   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3052   __ jcc(Assembler::equal, noException);
3053 
3054   // Exception pending
3055 
3056   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3057 
3058   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3059 
3060   // No exception case
3061   __ bind(noException);
3062 
3063   Label no_adjust;
3064 #ifdef ASSERT
3065   Label bail;
3066 #endif
3067   if (!cause_return) {
3068     Label no_prefix, not_special, check_rex_prefix;
3069 
3070     // If our stashed return pc was modified by the runtime we avoid touching it
3071     __ cmpptr(rbx, Address(rbp, wordSize));
3072     __ jcc(Assembler::notEqual, no_adjust);
3073 
3074     // Skip over the poll instruction.
3075     // See NativeInstruction::is_safepoint_poll()
3076     // Possible encodings:
3077     //      85 00       test   %eax,(%rax)
3078     //      85 01       test   %eax,(%rcx)
3079     //      85 02       test   %eax,(%rdx)
3080     //      85 03       test   %eax,(%rbx)
3081     //      85 06       test   %eax,(%rsi)
3082     //      85 07       test   %eax,(%rdi)
3083     //
3084     //   41 85 00       test   %eax,(%r8)
3085     //   41 85 01       test   %eax,(%r9)
3086     //   41 85 02       test   %eax,(%r10)
3087     //   41 85 03       test   %eax,(%r11)
3088     //   41 85 06       test   %eax,(%r14)
3089     //   41 85 07       test   %eax,(%r15)
3090     //
3091     //      85 04 24    test   %eax,(%rsp)
3092     //   41 85 04 24    test   %eax,(%r12)
3093     //      85 45 00    test   %eax,0x0(%rbp)
3094     //   41 85 45 00    test   %eax,0x0(%r13)
3095     //
3096     // Notes:
3097     //  Format of the legacy MAP0 test instruction:
3098     //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
3099     //  o  For a safepoint polling instruction like "test %eax,(%rax)", the encodings of the first
3100     //     register operand and of the base register of the memory operand are both in [0-8), so no
3101     //     additional REX prefix (whose REX.B bit would hold the MSB of the register encoding) is
3102     //     required, which is why a two-byte encoding is sufficient here.
3103     //  o  For a safepoint polling instruction like "test %eax,(%r8)", the encoding of the BASE
3104     //     register of the memory operand is 1000, so an additional REX prefix is needed, thereby
3105     //     adding an extra byte to the instruction encoding.
3106     //  o  If the BASE register is one of the 32 extended GPRs available only on targets supporting
3107     //     the Intel APX extension, a two-byte REX2 prefix must be emitted to hold the most
3108     //     significant two bits of the 5-bit register encoding.
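    //
    // Worked example (illustrative): for the poll encoding "41 85 45 00"
    // (test %eax,0x0(%r13)), the code below advances the stashed return pc by
    // 1 byte for the REX prefix, 1 byte because the r13/rbp base requires a
    // displacement byte, and finally 2 bytes for the opcode and ModRM --
    // 4 bytes in total, so execution resumes just past the poll instruction.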
3109 
3110     if (VM_Version::supports_apx_f()) {
3111       __ cmpb(Address(rbx, 0), Assembler::REX2);
3112       __ jccb(Assembler::notEqual, check_rex_prefix);
3113       __ addptr(rbx, 2);
3114       __ bind(check_rex_prefix);
3115     }
3116     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3117     __ jccb(Assembler::notEqual, no_prefix);
3118     __ addptr(rbx, 1);
3119     __ bind(no_prefix);
3120 #ifdef ASSERT
3121     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3122 #endif
3123     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3124     // r12/rsp 0x04
3125     // r13/rbp 0x05
3126     __ movzbq(rcx, Address(rbx, 1));
3127     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3128     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3129     __ cmpptr(rcx, 1);
3130     __ jccb(Assembler::above, not_special);
3131     __ addptr(rbx, 1);
3132     __ bind(not_special);
3133 #ifdef ASSERT
3134     // Verify the correct encoding of the poll we're about to skip.
3135     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3136     __ jcc(Assembler::notEqual, bail);
3137     // Mask out the modrm bits
3138     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3139     // rax encodes to 0, so if the bits are nonzero it's incorrect
3140     __ jcc(Assembler::notZero, bail);
3141 #endif
3142     // Adjust return pc forward to step over the safepoint poll instruction
3143     __ addptr(rbx, 2);
3144     __ movptr(Address(rbp, wordSize), rbx);
3145   }
3146 
3147   __ bind(no_adjust);
3148   // Normal exit, restore registers and exit.
3149   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3150   __ ret(0);
3151 
3152 #ifdef ASSERT
3153   __ bind(bail);
3154   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3155 #endif
3156 
3157   // Make sure all code is generated
3158   masm->flush();
3159 
3160   // Fill-out other meta info
3161   SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3162 
3163   AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, (uint)id, name);
3164   return sp_blob;
3165 }
3166 
3167 //
3168 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3169 //
3170 // Generate a stub that calls into the VM to find out the proper destination
3171 // of a Java call. All the argument registers are live at this point,
3172 // but since this is generic code we don't know what they are and the caller
3173 // must do any GC of the args.
3174 //
3175 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3176   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3177   assert(is_resolve_id(id), "expected a resolve stub id");
3178 
3179   const char* name = SharedRuntime::stub_name(id);
3180   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)id, name);
3181   if (blob != nullptr) {
3182     return blob->as_runtime_stub();
3183   }
3184 
3185   // allocate space for the code
3186   ResourceMark rm;
3187   CodeBuffer buffer(name, 1552, 512);
3188   MacroAssembler* masm = new MacroAssembler(&buffer);
3189 
3190   int frame_size_in_words;
3191 
3192   OopMapSet *oop_maps = new OopMapSet();
3193   OopMap* map = nullptr;
3194 
3195   int start = __ offset();
3196 
3197   // No need to save vector registers since they are caller-saved anyway.
3198   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3199 
3200   int frame_complete = __ offset();
3201 
3202   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3203 
3204   __ mov(c_rarg0, r15_thread);
3205 
3206   __ call(RuntimeAddress(destination));
3207 
3208 
3209   // Set an oopmap for the call site.
3210   // We need this not only for callee-saved registers, but also for volatile
3211   // registers that the compiler might be keeping live across a safepoint.
3212 
3213   oop_maps->add_gc_map( __ offset() - start, map);
3214 
3215   // rax contains the address we are going to jump to assuming no exception got installed
3216 
3217   // clear last_Java_sp
3218   __ reset_last_Java_frame(false);
3219   // check for pending exceptions
3220   Label pending;
3221   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3222   __ jcc(Assembler::notEqual, pending);
3223 
3224   // get the returned Method*
3225   __ get_vm_result_metadata(rbx);
3226   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3227 
3228   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3229 
3230   RegisterSaver::restore_live_registers(masm);
3231 
3232   // We are back to the original state on entry and ready to go.
3233 
3234   __ jmp(rax);
3235 
3236   // Pending exception after the safepoint
3237 
3238   __ bind(pending);
3239 
3240   RegisterSaver::restore_live_registers(masm);
3241 
3242   // exception pending => remove activation and forward to exception handler
3243 
3244   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3245 
3246   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3247   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3248 
3249   // -------------
3250   // make sure all code is generated
3251   masm->flush();
3252 
3253   // Return the blob.
3254   // Note: the frame size passed to new_runtime_stub below is in words.
3255   RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3256 
3257   AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, (uint)id, name);
3258   return rs_blob;
3259 }
3260 
3261 // Continuation point for throwing of implicit exceptions that are
3262 // not handled in the current activation. Fabricates an exception
3263 // oop and initiates normal exception dispatching in this
3264 // frame. Since we need to preserve callee-saved values (currently
3265 // only for C2, but done for C1 as well) we need a callee-saved oop
3266 // map and therefore have to make these stubs into RuntimeStubs
3267 // rather than BufferBlobs.  If the compiler needs all registers to
3268 // be preserved between the fault point and the exception handler
3269 // then it must assume responsibility for that in
3270 // AbstractCompiler::continuation_for_implicit_null_exception or
3271 // continuation_for_implicit_division_by_zero_exception. All other
3272 // implicit exceptions (e.g., NullPointerException or
3273 // AbstractMethodError on entry) are either at call sites or
3274 // otherwise assume that stack unwinding will be initiated, so
3275 // caller-saved registers were assumed volatile in the compiler.
3276 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
3277   assert(is_throw_id(id), "expected a throw stub id");
3278 
3279   const char* name = SharedRuntime::stub_name(id);
3280 
3281   // Information about frame layout at time of blocking runtime call.
3282   // Note that we only have to preserve callee-saved registers since
3283   // the compilers are responsible for supplying a continuation point
3284   // if they expect all registers to be preserved.
3285   enum layout {
3286     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3287     rbp_off2,
3288     return_off,
3289     return_off2,
3290     framesize // inclusive of return address
3291   };
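  // Illustrative arithmetic, assuming the usual values of
  // frame::arg_reg_save_area_bytes: on Windows the 32-byte register-argument
  // save area gives rbp_off = 32/4 = 8 and framesize = 12 slots, so the
  // prolog below subtracts (12 - 4) * 4 = 32 bytes; elsewhere the save area
  // is empty, framesize = 4 slots, and the prolog subtracts nothing.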
3292 
3293   int insts_size = 512;
3294   int locs_size  = 64;
3295 
3296   const char* timer_msg = "SharedRuntime generate_throw_exception";
3297   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3298 
3299   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)id, name);
3300   if (blob != nullptr) {
3301     return blob->as_runtime_stub();
3302   }
3303 
3304   ResourceMark rm;
3305   CodeBuffer code(name, insts_size, locs_size);
3306   OopMapSet* oop_maps  = new OopMapSet();
3307   MacroAssembler* masm = new MacroAssembler(&code);
3308 
3309   address start = __ pc();
3310 
3311   // This is an inlined and slightly modified version of call_VM
3312   // which has the ability to fetch the return PC out of
3313   // thread-local storage and also sets up last_Java_sp slightly
3314   // differently than the real call_VM
3315 
3316   __ enter(); // required for proper stackwalking of RuntimeStub frame
3317 
3318   assert(is_even(framesize/2), "sp not 16-byte aligned");
3319 
3320   // return address and rbp are already in place
3321   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3322 
3323   int frame_complete = __ pc() - start;
3324 
3325   // Set up last_Java_sp and last_Java_fp
3326   address the_pc = __ pc();
3327   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3328   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3329 
3330   // Call runtime
3331   __ movptr(c_rarg0, r15_thread);
3332   BLOCK_COMMENT("call runtime_entry");
3333   __ call(RuntimeAddress(runtime_entry));
3334 
3335   // Generate oop map
3336   OopMap* map = new OopMap(framesize, 0);
3337 
3338   oop_maps->add_gc_map(the_pc - start, map);
3339 
3340   __ reset_last_Java_frame(true);
3341 
3342   __ leave(); // required for proper stackwalking of RuntimeStub frame
3343 
3344   // check for pending exceptions
3345 #ifdef ASSERT
3346   Label L;
3347   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3348   __ jcc(Assembler::notEqual, L);
3349   __ should_not_reach_here();
3350   __ bind(L);
3351 #endif // ASSERT
3352   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3353 
3354 
3355   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3356   RuntimeStub* stub =
3357     RuntimeStub::new_runtime_stub(name,
3358                                   &code,
3359                                   frame_complete,
3360                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3361                                   oop_maps, false);
3362   AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, (uint)id, name);
3363 
3364   return stub;
3365 }
3366 
3367 //------------------------------Montgomery multiplication------------------------
3368 //
3369 
3370 #ifndef _WINDOWS
3371 
3372 // Subtract 0:b from carry:a.  Return carry.
3373 static julong
3374 sub(julong a[], julong b[], julong carry, long len) {
3375   long long i = 0, cnt = len;
3376   julong tmp;
3377   asm volatile("clc; "
3378                "0: ; "
3379                "mov (%[b], %[i], 8), %[tmp]; "
3380                "sbb %[tmp], (%[a], %[i], 8); "
3381                "inc %[i]; dec %[cnt]; "
3382                "jne 0b; "
3383                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3384                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3385                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3386                : "memory");
3387   return tmp;
3388 }
3389 
3390 // Multiply (unsigned) Long A by Long B, accumulating the double-
3391 // length result into the accumulator formed of T0, T1, and T2.
3392 #define MACC(A, B, T0, T1, T2)                                  \
3393 do {                                                            \
3394   unsigned long hi, lo;                                         \
3395   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3396            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3397            : "r"(A), "a"(B) : "cc");                            \
3398  } while(0)
3399 
3400 // As above, but add twice the double-length result into the
3401 // accumulator.
3402 #define MACC2(A, B, T0, T1, T2)                                 \
3403 do {                                                            \
3404   unsigned long hi, lo;                                         \
3405   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3406            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3407            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3408            : "r"(A), "a"(B) : "cc");                            \
3409  } while(0)
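// In equation form (illustrative): treating (T2:T1:T0) as a single
// triple-precision value, MACC performs (T2:T1:T0) += A * B and MACC2
// performs (T2:T1:T0) += 2 * A * B, where A * B is the full 128-bit product.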
3410 
3411 #else //_WINDOWS
3412 
3413 static julong
3414 sub(julong a[], julong b[], julong carry, long len) {
3415   long i;
3416   julong tmp;
3417   unsigned char c = 1;
3418   for (i = 0; i < len; i++) {
3419     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3420     a[i] = tmp;
3421   }
3422   c = _addcarry_u64(c, carry, ~0, &tmp);
3423   return tmp;
3424 }
3425 
3426 // Multiply (unsigned) Long A by Long B, accumulating the double-
3427 // length result into the accumulator formed of T0, T1, and T2.
3428 #define MACC(A, B, T0, T1, T2)                          \
3429 do {                                                    \
3430   julong hi, lo;                            \
3431   lo = _umul128(A, B, &hi);                             \
3432   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3433   c = _addcarry_u64(c, hi, T1, &T1);                    \
3434   _addcarry_u64(c, T2, 0, &T2);                         \
3435  } while(0)
3436 
3437 // As above, but add twice the double-length result into the
3438 // accumulator.
3439 #define MACC2(A, B, T0, T1, T2)                         \
3440 do {                                                    \
3441   julong hi, lo;                            \
3442   lo = _umul128(A, B, &hi);                             \
3443   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3444   c = _addcarry_u64(c, hi, T1, &T1);                    \
3445   _addcarry_u64(c, T2, 0, &T2);                         \
3446   c = _addcarry_u64(0, lo, T0, &T0);                    \
3447   c = _addcarry_u64(c, hi, T1, &T1);                    \
3448   _addcarry_u64(c, T2, 0, &T2);                         \
3449  } while(0)
3450 
3451 #endif //_WINDOWS
3452 
3453 // Fast Montgomery multiplication.  The derivation of the algorithm is
3454 // in  A Cryptographic Library for the Motorola DSP56000,
3455 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
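// Summary of the computation (illustrative, not from the original comment):
// with R = 2^(64*len), inv chosen so that inv * n[0] == -1 (mod 2^64), and
// a, b, n stored as little-endian arrays of 64-bit words, the routine below
// computes the Montgomery product m == a * b * R^-1 (mod n).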
3456 
3457 static void NOINLINE
3458 montgomery_multiply(julong a[], julong b[], julong n[],
3459                     julong m[], julong inv, int len) {
3460   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3461   int i;
3462 
3463   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3464 
3465   for (i = 0; i < len; i++) {
3466     int j;
3467     for (j = 0; j < i; j++) {
3468       MACC(a[j], b[i-j], t0, t1, t2);
3469       MACC(m[j], n[i-j], t0, t1, t2);
3470     }
3471     MACC(a[i], b[0], t0, t1, t2);
3472     m[i] = t0 * inv;
3473     MACC(m[i], n[0], t0, t1, t2);
3474 
3475     assert(t0 == 0, "broken Montgomery multiply");
3476 
3477     t0 = t1; t1 = t2; t2 = 0;
3478   }
3479 
3480   for (i = len; i < 2*len; i++) {
3481     int j;
3482     for (j = i-len+1; j < len; j++) {
3483       MACC(a[j], b[i-j], t0, t1, t2);
3484       MACC(m[j], n[i-j], t0, t1, t2);
3485     }
3486     m[i-len] = t0;
3487     t0 = t1; t1 = t2; t2 = 0;
3488   }
3489 
3490   while (t0)
3491     t0 = sub(m, n, t0, len);
3492 }
3493 
3494 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3495 // multiplies so it should be up to 25% faster than Montgomery
3496 // multiplication.  However, its loop control is more complex and it
3497 // may actually run slower on some machines.
3498 
3499 static void NOINLINE
3500 montgomery_square(julong a[], julong n[],
3501                   julong m[], julong inv, int len) {
3502   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3503   int i;
3504 
3505   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3506 
3507   for (i = 0; i < len; i++) {
3508     int j;
3509     int end = (i+1)/2;
3510     for (j = 0; j < end; j++) {
3511       MACC2(a[j], a[i-j], t0, t1, t2);
3512       MACC(m[j], n[i-j], t0, t1, t2);
3513     }
3514     if ((i & 1) == 0) {
3515       MACC(a[j], a[j], t0, t1, t2);
3516     }
3517     for (; j < i; j++) {
3518       MACC(m[j], n[i-j], t0, t1, t2);
3519     }
3520     m[i] = t0 * inv;
3521     MACC(m[i], n[0], t0, t1, t2);
3522 
3523     assert(t0 == 0, "broken Montgomery square");
3524 
3525     t0 = t1; t1 = t2; t2 = 0;
3526   }
3527 
3528   for (i = len; i < 2*len; i++) {
3529     int start = i-len+1;
3530     int end = start + (len - start)/2;
3531     int j;
3532     for (j = start; j < end; j++) {
3533       MACC2(a[j], a[i-j], t0, t1, t2);
3534       MACC(m[j], n[i-j], t0, t1, t2);
3535     }
3536     if ((i & 1) == 0) {
3537       MACC(a[j], a[j], t0, t1, t2);
3538     }
3539     for (; j < len; j++) {
3540       MACC(m[j], n[i-j], t0, t1, t2);
3541     }
3542     m[i-len] = t0;
3543     t0 = t1; t1 = t2; t2 = 0;
3544   }
3545 
3546   while (t0)
3547     t0 = sub(m, n, t0, len);
3548 }
3549 
3550 // Swap words in a longword.
3551 static julong swap(julong x) {
3552   return (x << 32) | (x >> 32);
3553 }
3554 
3555 // Copy len longwords from s to d, word-swapping as we go.  The
3556 // destination array is reversed.
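// Worked example (illustrative): with len == 2 and
// s == { 0x00000001_00000002, 0x00000003_00000004 }, the result is
// d[1] == 0x00000002_00000001 and d[0] == 0x00000004_00000003, i.e. the
// longword order is reversed and the two 32-bit halves of each longword are
// swapped.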
3557 static void reverse_words(julong *s, julong *d, int len) {
3558   d += len;
3559   while(len-- > 0) {
3560     d--;
3561     *d = swap(*s);
3562     s++;
3563   }
3564 }
3565 
3566 // The threshold at which squaring is advantageous was determined
3567 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3568 #define MONTGOMERY_SQUARING_THRESHOLD 64
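// (A threshold of 64 jints corresponds to a 2048-bit operand; the 'len'
// compared against it below is the jint count.)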
3569 
3570 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3571                                         jint len, jlong inv,
3572                                         jint *m_ints) {
3573   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3574   int longwords = len/2;
3575 
3576   // Make very sure we don't use so much space that the stack might
3577   // overflow.  512 jints corresponds to a 16384-bit integer and
3578   // will use here a total of 8k bytes of stack space.
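  // Worked numbers (illustrative): with divisor == sizeof(julong) * 4 == 32,
  // the guarantee below caps longwords at 8192 / 32 == 256 (i.e. len <= 512
  // jints), so the four scratch arrays occupy at most 256 * 8 * 4 == 8192
  // bytes of stack.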
3579   int divisor = sizeof(julong) * 4;
3580   guarantee(longwords <= 8192 / divisor, "must be");
3581   int total_allocation = longwords * sizeof (julong) * 4;
3582   julong *scratch = (julong *)alloca(total_allocation);
3583 
3584   // Local scratch arrays
3585   julong
3586     *a = scratch + 0 * longwords,
3587     *b = scratch + 1 * longwords,
3588     *n = scratch + 2 * longwords,
3589     *m = scratch + 3 * longwords;
3590 
3591   reverse_words((julong *)a_ints, a, longwords);
3592   reverse_words((julong *)b_ints, b, longwords);
3593   reverse_words((julong *)n_ints, n, longwords);
3594 
3595   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3596 
3597   reverse_words(m, (julong *)m_ints, longwords);
3598 }
3599 
3600 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3601                                       jint len, jlong inv,
3602                                       jint *m_ints) {
3603   assert(len % 2 == 0, "array length in montgomery_square must be even");
3604   int longwords = len/2;
3605 
3606   // Make very sure we don't use so much space that the stack might
3607   // overflow.  512 jints corresponds to a 16384-bit integer and
3608   // will use here a total of 6k bytes of stack space.
3609   int divisor = sizeof(julong) * 3;
3610   guarantee(longwords <= (8192 / divisor), "must be");
3611   int total_allocation = longwords * sizeof (julong) * 3;
3612   julong *scratch = (julong *)alloca(total_allocation);
3613 
3614   // Local scratch arrays
3615   julong
3616     *a = scratch + 0 * longwords,
3617     *n = scratch + 1 * longwords,
3618     *m = scratch + 2 * longwords;
3619 
3620   reverse_words((julong *)a_ints, a, longwords);
3621   reverse_words((julong *)n_ints, n, longwords);
3622 
3623   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3624     ::montgomery_square(a, n, m, (julong)inv, longwords);
3625   } else {
3626     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3627   }
3628 
3629   reverse_words(m, (julong *)m_ints, longwords);
3630 }
3631 
3632 #if INCLUDE_JFR
3633 
3634 // For c2: c_rarg0 is junk; calls into the runtime to write a checkpoint.
3635 // It returns a jobject handle to the event writer.
3636 // The handle is dereferenced and the return value is the event writer oop.
3637 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3638   enum layout {
3639     rbp_off,
3640     rbpH_off,
3641     return_off,
3642     return_off2,
3643     framesize // inclusive of return address
3644   };
3645 
3646   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id);
3647   CodeBuffer code(name, 1024, 64);
3648   MacroAssembler* masm = new MacroAssembler(&code);
3649   address start = __ pc();
3650 
3651   __ enter();
3652   address the_pc = __ pc();
3653 
3654   int frame_complete = the_pc - start;
3655 
3656   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3657   __ movptr(c_rarg0, r15_thread);
3658   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3659   __ reset_last_Java_frame(true);
3660 
3661   // rax is jobject handle result, unpack and process it through a barrier.
3662   __ resolve_global_jobject(rax, c_rarg0);
3663 
3664   __ leave();
3665   __ ret(0);
3666 
3667   OopMapSet* oop_maps = new OopMapSet();
3668   OopMap* map = new OopMap(framesize, 1);
3669   oop_maps->add_gc_map(frame_complete, map);
3670 
3671   RuntimeStub* stub =
3672     RuntimeStub::new_runtime_stub(name,
3673                                   &code,
3674                                   frame_complete,
3675                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3676                                   oop_maps,
3677                                   false);
3678   return stub;
3679 }
3680 
3681 // For c2: call to return a leased buffer.
3682 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3683   enum layout {
3684     rbp_off,
3685     rbpH_off,
3686     return_off,
3687     return_off2,
3688     framesize // inclusive of return address
3689   };
3690 
3691   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id);
3692   CodeBuffer code(name, 1024, 64);
3693   MacroAssembler* masm = new MacroAssembler(&code);
3694   address start = __ pc();
3695 
3696   __ enter();
3697   address the_pc = __ pc();
3698 
3699   int frame_complete = the_pc - start;
3700 
3701   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3702   __ movptr(c_rarg0, r15_thread);
3703   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3704   __ reset_last_Java_frame(true);
3705 
3706   __ leave();
3707   __ ret(0);
3708 
3709   OopMapSet* oop_maps = new OopMapSet();
3710   OopMap* map = new OopMap(framesize, 1);
3711   oop_maps->add_gc_map(frame_complete, map);
3712 
3713   RuntimeStub* stub =
3714     RuntimeStub::new_runtime_stub(name,
3715                                   &code,
3716                                   frame_complete,
3717                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3718                                   oop_maps,
3719                                   false);
3720   return stub;
3721 }
3722 
3723 #endif // INCLUDE_JFR
3724