1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/timerTrace.hpp"
  56 #include "runtime/vframeArray.hpp"
  57 #include "runtime/vm_version.hpp"
  58 #include "utilities/align.hpp"
  59 #include "utilities/checkedCast.hpp"
  60 #include "utilities/formatBuffer.hpp"
  61 #include "vmreg_x86.inline.hpp"
  62 #ifdef COMPILER1
  63 #include "c1/c1_Runtime1.hpp"
  64 #endif
  65 #ifdef COMPILER2
  66 #include "opto/runtime.hpp"
  67 #endif
  68 #if INCLUDE_JVMCI
  69 #include "jvmci/jvmciJavaClasses.hpp"
  70 #endif
  71 
  72 #define __ masm->
  73 
  74 #ifdef PRODUCT
  75 #define BLOCK_COMMENT(str) /* nothing */
  76 #else
  77 #define BLOCK_COMMENT(str) __ block_comment(str)
  78 #endif // PRODUCT
  79 
  80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  81 
  82 class RegisterSaver {
  83   // Capture info about frame layout.  Layout offsets are in jint
  84   // units because compiler frame slots are jints.
  85 #define XSAVE_AREA_BEGIN 160
  86 #define XSAVE_AREA_YMM_BEGIN 576
  87 #define XSAVE_AREA_EGPRS 960
  88 #define XSAVE_AREA_OPMASK_BEGIN 1088
  89 #define XSAVE_AREA_ZMM_BEGIN 1152
  90 #define XSAVE_AREA_UPPERBANK 1664
  91 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  92 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  93 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  94 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  96   enum layout {
  97     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  98     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
  99     DEF_XMM_OFFS(0),
 100     DEF_XMM_OFFS(1),
 101     // 2..15 are implied in range usage
 102     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 103     DEF_YMM_OFFS(0),
 104     DEF_YMM_OFFS(1),
 105     // 2..15 are implied in range usage
 106     r31_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 107     r31H_off,
 108     r30_off, r30H_off,
 109     r29_off, r29H_off,
 110     r28_off, r28H_off,
 111     r27_off, r27H_off,
 112     r26_off, r26H_off,
 113     r25_off, r25H_off,
 114     r24_off, r24H_off,
 115     r23_off, r23H_off,
 116     r22_off, r22H_off,
 117     r21_off, r21H_off,
 118     r20_off, r20H_off,
 119     r19_off, r19H_off,
 120     r18_off, r18H_off,
 121     r17_off, r17H_off,
 122     r16_off, r16H_off,
 123     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 124     DEF_OPMASK_OFFS(0),
 125     DEF_OPMASK_OFFS(1),
 126     // 2..7 are implied in range usage
 127     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 128     DEF_ZMM_OFFS(0),
 129     DEF_ZMM_OFFS(1),
 130     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 131     DEF_ZMM_UPPER_OFFS(16),
 132     DEF_ZMM_UPPER_OFFS(17),
 133     // 18..31 are implied in range usage
 134     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 135     fpu_stateH_end,
 136     r15_off, r15H_off,
 137     r14_off, r14H_off,
 138     r13_off, r13H_off,
 139     r12_off, r12H_off,
 140     r11_off, r11H_off,
 141     r10_off, r10H_off,
 142     r9_off,  r9H_off,
 143     r8_off,  r8H_off,
 144     rdi_off, rdiH_off,
 145     rsi_off, rsiH_off,
 146     ignore_off, ignoreH_off,  // extra copy of rbp
 147     rsp_off, rspH_off,
 148     rbx_off, rbxH_off,
 149     rdx_off, rdxH_off,
 150     rcx_off, rcxH_off,
 151     rax_off, raxH_off,
 152     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 153     align_off, alignH_off,
 154     flags_off, flagsH_off,
 155     // The frame sender code expects that rbp will be in the "natural" place and
 156     // will override any oopMap setting for it. We must therefore force the layout
 157     // so that it agrees with the frame sender code.
 158     rbp_off, rbpH_off,        // copy of rbp we will restore
 159     return_off, returnH_off,  // slot for return address
 160     reg_save_size             // size in compiler stack slots
 161   };
 162 
 163  public:
 164   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 165   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 166 
 167   // Offsets into the register save area
 168   // Used by deoptimization when it is managing result register
 169   // values on its own
 170 
 171   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 172   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 173   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 174   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 175   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 176 
 177   // During deoptimization only the result registers need to be restored,
 178   // all the other values have already been extracted.
 179   static void restore_result_registers(MacroAssembler* masm);
 180 };
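
// Note on the layout above: the *_off values are offsets from rsp, in 4-byte
// compiler slots, once save_live_registers() has built its frame, so e.g.
// rax_offset_in_bytes() is simply rax_off * BytesPerInt. The XSAVE_AREA_*
// constants describe the save-image layout this code assumes: legacy XMM state
// at byte 160 of the fxsave/xsave image, followed by the YMM high halves, the
// APX extended GPRs, the opmask registers and the ZMM state.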
 181 
 182 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 183   int off = 0;
 184   int num_xmm_regs = XMMRegister::available_xmm_registers();
 185 #if COMPILER2_OR_JVMCI
 186   if (save_wide_vectors && UseAVX == 0) {
 187     save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
 188   }
 189   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 190 #else
 191   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 192 #endif
 193 
 194   // Always make the frame size 16-byte aligned; both vector and non-vector saves allocate the same frame
 195   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 196   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 197   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 198   // CodeBlob frame size is in words.
 199   int frame_size_in_words = frame_size_in_bytes / wordSize;
 200   *total_frame_words = frame_size_in_words;
 201 
 202   // Save registers, fpu state, and flags.
 203   // We assume caller has already pushed the return address onto the
 204   // stack, so rsp is 8-byte aligned here.
 205   // We push rbp twice in this sequence because we want the real rbp
 206   // to be under the return address, like a normal enter.
 207 
 208   __ enter();          // rsp becomes 16-byte aligned here
 209   __ pushf();
 210   // Make sure rsp stays 16-byte aligned
 211   __ subq(rsp, 8);
 212   // Push CPU state in multiples of 16 bytes
 213   __ save_legacy_gprs();
 214   __ push_FPU_state();
 215 
 216 
 217   // The CPU state push above handles this on EVEX-enabled targets
 218   if (save_wide_vectors) {
 219     // Save upper half of YMM registers(0..15)
 220     int base_addr = XSAVE_AREA_YMM_BEGIN;
 221     for (int n = 0; n < 16; n++) {
 222       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 223     }
 224     if (VM_Version::supports_evex()) {
 225       // Save upper half of ZMM registers(0..15)
 226       base_addr = XSAVE_AREA_ZMM_BEGIN;
 227       for (int n = 0; n < 16; n++) {
 228         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 229       }
 230       // Save full ZMM registers(16..num_xmm_regs)
 231       base_addr = XSAVE_AREA_UPPERBANK;
 232       off = 0;
 233       int vector_len = Assembler::AVX_512bit;
 234       for (int n = 16; n < num_xmm_regs; n++) {
 235         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 236       }
 237 #if COMPILER2_OR_JVMCI
 238       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 239       off = 0;
 240       for(int n = 0; n < KRegister::number_of_registers; n++) {
 241         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 242       }
 243 #endif
 244     }
 245   } else {
 246     if (VM_Version::supports_evex()) {
 247       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 248       int base_addr = XSAVE_AREA_UPPERBANK;
 249       off = 0;
 250       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 251       for (int n = 16; n < num_xmm_regs; n++) {
 252         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 253       }
 254 #if COMPILER2_OR_JVMCI
 255       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 256       off = 0;
 257       for(int n = 0; n < KRegister::number_of_registers; n++) {
 258         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 259       }
 260 #endif
 261     }
 262   }
 263 
 264 #if COMPILER2_OR_JVMCI
 265   if (UseAPX) {
 266       int base_addr = XSAVE_AREA_EGPRS;
 267       off = 0;
 268       for(int n = 16; n < Register::number_of_registers; n++) {
 269         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 270       }
 271   }
 272 #endif
 273 
 274   __ vzeroupper();
 275   if (frame::arg_reg_save_area_bytes != 0) {
 276     // Allocate argument register save area
 277     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 278   }
 279 
 280   // Set an oopmap for the call site.  This oopmap will map all
 281   // oop-registers and debug-info registers as callee-saved.  This
 282   // will allow deoptimization at this safepoint to find all possible
 283   // debug-info recordings, as well as let GC find all oops.
 284 
 285   OopMapSet *oop_maps = new OopMapSet();
 286   OopMap* map = new OopMap(frame_size_in_slots, 0);
 287 
 288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 289 
 290   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 291   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 293   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 294   // rbp location is known implicitly by the frame sender code, needs no oopmap,
 295   // and the location where rbp was saved is ignored
 296   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 297   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 305   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 306 
 307   if (UseAPX) {
 308     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 309     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 324   }
 325   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 326   // on EVEX-enabled targets it is included in the xsave area
 327   off = xmm0_off;
 328   int delta = xmm1_off - off;
 329   for (int n = 0; n < 16; n++) {
 330     XMMRegister xmm_name = as_XMMRegister(n);
 331     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 332     off += delta;
 333   }
 334   if (UseAVX > 2) {
 335     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 336     off = zmm16_off;
 337     delta = zmm17_off - off;
 338     for (int n = 16; n < num_xmm_regs; n++) {
 339       XMMRegister zmm_name = as_XMMRegister(n);
 340       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 341       off += delta;
 342     }
 343   }
 344 
 345 #if COMPILER2_OR_JVMCI
 346   if (save_wide_vectors) {
 347     // Save upper half of YMM registers(0..15)
 348     off = ymm0_off;
 349     delta = ymm1_off - ymm0_off;
 350     for (int n = 0; n < 16; n++) {
 351       XMMRegister ymm_name = as_XMMRegister(n);
 352       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 353       off += delta;
 354     }
 355     if (VM_Version::supports_evex()) {
 356       // Save upper half of ZMM registers(0..15)
 357       off = zmm0_off;
 358       delta = zmm1_off - zmm0_off;
 359       for (int n = 0; n < 16; n++) {
 360         XMMRegister zmm_name = as_XMMRegister(n);
 361         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 362         off += delta;
 363       }
 364     }
 365   }
 366 #endif // COMPILER2_OR_JVMCI
 367 
 368   // %%% These should all be a waste but we'll keep things as they were for now
 369   if (true) {
 370     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 371     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 373     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 374     // rbp location is known implicitly by the frame sender code, needs no oopmap
 375     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 376     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 385     if (UseAPX) {
 386       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 387       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 402     }
 403     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 404     // on EVEX-enabled targets it is included in the xsave area
 405     off = xmm0H_off;
 406     delta = xmm1H_off - off;
 407     for (int n = 0; n < 16; n++) {
 408       XMMRegister xmm_name = as_XMMRegister(n);
 409       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 410       off += delta;
 411     }
 412     if (UseAVX > 2) {
 413       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 414       off = zmm16H_off;
 415       delta = zmm17H_off - off;
 416       for (int n = 16; n < num_xmm_regs; n++) {
 417         XMMRegister zmm_name = as_XMMRegister(n);
 418         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 419         off += delta;
 420       }
 421     }
 422   }
 423 
 424   return map;
 425 }
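
// Typical usage (sketch): save everything, make a call into the VM with the
// returned OopMap recorded at the call pc, then restore, e.g.
//
//   int frame_size_in_words;
//   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
//   // ... set up last_Java_frame, call into the VM, record `map` in the blob's OopMapSet ...
//   RegisterSaver::restore_live_registers(masm, save_wide_vectors);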
 426 
 427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 428   int num_xmm_regs = XMMRegister::available_xmm_registers();
 429   if (frame::arg_reg_save_area_bytes != 0) {
 430     // Pop arg register save area
 431     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 432   }
 433 
 434 #if COMPILER2_OR_JVMCI
 435   if (restore_wide_vectors) {
 436     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 437     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 438   }
 439 #else
 440   assert(!restore_wide_vectors, "vectors are generated only by C2");
 441 #endif
 442 
 443   __ vzeroupper();
 444 
 445   // On EVEX-enabled targets everything is handled in the FPU state pop
 446   if (restore_wide_vectors) {
 447     // Restore upper half of YMM registers (0..15)
 448     int base_addr = XSAVE_AREA_YMM_BEGIN;
 449     for (int n = 0; n < 16; n++) {
 450       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 451     }
 452     if (VM_Version::supports_evex()) {
 453       // Restore upper half of ZMM registers (0..15)
 454       base_addr = XSAVE_AREA_ZMM_BEGIN;
 455       for (int n = 0; n < 16; n++) {
 456         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 457       }
 458       // Restore full ZMM registers(16..num_xmm_regs)
 459       base_addr = XSAVE_AREA_UPPERBANK;
 460       int vector_len = Assembler::AVX_512bit;
 461       int off = 0;
 462       for (int n = 16; n < num_xmm_regs; n++) {
 463         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 464       }
 465 #if COMPILER2_OR_JVMCI
 466       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 467       off = 0;
 468       for (int n = 0; n < KRegister::number_of_registers; n++) {
 469         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 470       }
 471 #endif
 472     }
 473   } else {
 474     if (VM_Version::supports_evex()) {
 475       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 476       int base_addr = XSAVE_AREA_UPPERBANK;
 477       int off = 0;
 478       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 479       for (int n = 16; n < num_xmm_regs; n++) {
 480         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 481       }
 482 #if COMPILER2_OR_JVMCI
 483       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 484       off = 0;
 485       for (int n = 0; n < KRegister::number_of_registers; n++) {
 486         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 487       }
 488 #endif
 489     }
 490   }
 491 
 492 #if COMPILER2_OR_JVMCI
 493   if (UseAPX) {
 494     int base_addr = XSAVE_AREA_EGPRS;
 495     int off = 0;
 496     for (int n = 16; n < Register::number_of_registers; n++) {
 497       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 498     }
 499   }
 500 #endif
 501 
 502   // Recover CPU state
 503   __ pop_FPU_state();
 504   __ restore_legacy_gprs();
 505   __ addq(rsp, 8);
 506   __ popf();
 507   // Get the rbp described implicitly by the calling convention (no oopMap)
 508   __ pop(rbp);
 509 }
 510 
 511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 512 
 513   // Just restore the result registers. Only used by deoptimization. By
 514   // now any callee-save register that needs to be restored to a c2
 515   // caller of the deoptee has been extracted into the vframeArray
 516   // and will be stuffed into the c2i adapter we create for later
 517   // restoration, so only result registers need to be restored here.
 518 
 519   // Restore fp result register
 520   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 521   // Restore integer result register
 522   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 523   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 524 
 525   // Pop all of the register save area off the stack except the return address
 526   __ addptr(rsp, return_offset_in_bytes());
 527 }
 528 
 529 // Is the vector's size (in bytes) bigger than the size saved by default?
 530 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 531 bool SharedRuntime::is_wide_vector(int size) {
 532   return size > 16;
 533 }
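
// For example, is_wide_vector(32) and is_wide_vector(64) are true (YMM/ZMM
// sized values need the extra saving done above), while 16-byte XMM values
// are already covered by fxsave/fxrstor.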
 534 
 535 // ---------------------------------------------------------------------------
 536 // Read the array of BasicTypes from a signature, and compute where the
 537 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 538 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 539 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 540 // as framesizes are fixed.
 541 // VMRegImpl::stack0 refers to the first slot 0(sp),
 542 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
 543 // Registers up to Register::number_of_registers are the 64-bit
 544 // integer registers.
 545 
 546 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 547 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 548 // units regardless of build. Of course, for i486 there is no 64-bit build.
 549 
 550 // The Java calling convention is a "shifted" version of the C ABI.
 551 // By skipping the first C ABI register we can call non-static jni methods
 552 // with small numbers of arguments without having to shuffle the arguments
 553 // at all. Since we control the java ABI we ought to at least get some
 554 // advantage out of it.
 555 
 556 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 557                                            VMRegPair *regs,
 558                                            int total_args_passed) {
 559 
 560   // Create the mapping between argument positions and
 561   // registers.
 562   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 563     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 564   };
 565   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 566     j_farg0, j_farg1, j_farg2, j_farg3,
 567     j_farg4, j_farg5, j_farg6, j_farg7
 568   };
 569 
 570 
 571   uint int_args = 0;
 572   uint fp_args = 0;
 573   uint stk_args = 0;
 574 
 575   for (int i = 0; i < total_args_passed; i++) {
 576     switch (sig_bt[i]) {
 577     case T_BOOLEAN:
 578     case T_CHAR:
 579     case T_BYTE:
 580     case T_SHORT:
 581     case T_INT:
 582       if (int_args < Argument::n_int_register_parameters_j) {
 583         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 584       } else {
 585         stk_args = align_up(stk_args, 2);
 586         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 587         stk_args += 1;
 588       }
 589       break;
 590     case T_VOID:
 591       // halves of T_LONG or T_DOUBLE
 592       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 593       regs[i].set_bad();
 594       break;
 595     case T_LONG:
 596       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 597       // fall through
 598     case T_OBJECT:
 599     case T_ARRAY:
 600     case T_ADDRESS:
 601       if (int_args < Argument::n_int_register_parameters_j) {
 602         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 603       } else {
 604         stk_args = align_up(stk_args, 2);
 605         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 606         stk_args += 2;
 607       }
 608       break;
 609     case T_FLOAT:
 610       if (fp_args < Argument::n_float_register_parameters_j) {
 611         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 612       } else {
 613         stk_args = align_up(stk_args, 2);
 614         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 615         stk_args += 1;
 616       }
 617       break;
 618     case T_DOUBLE:
 619       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 620       if (fp_args < Argument::n_float_register_parameters_j) {
 621         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 622       } else {
 623         stk_args = align_up(stk_args, 2);
 624         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 625         stk_args += 2;
 626       }
 627       break;
 628     default:
 629       ShouldNotReachHere();
 630       break;
 631     }
 632   }
 633 
 634   return stk_args;
 635 }
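
// Illustrative example: for sig_bt = { T_INT, T_LONG, T_VOID, T_OBJECT,
// T_FLOAT, T_DOUBLE, T_VOID } the loop above assigns j_rarg0, j_rarg1, -,
// j_rarg2, j_farg0, j_farg1, - and returns 0: nothing spills to the stack
// until more than Argument::n_int_register_parameters_j integer/oop arguments
// (or more than Argument::n_float_register_parameters_j float/double
// arguments) are passed.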
 636 
 637 // Patch the caller's callsite with the entry to compiled code if it exists.
 638 static void patch_callers_callsite(MacroAssembler *masm) {
 639   Label L;
 640   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 641   __ jcc(Assembler::equal, L);
 642 
 643   // Save the current stack pointer
 644   __ mov(r13, rsp);
 645   // Schedule the branch target address early.
 646   // Call into the VM to patch the caller, then jump to compiled callee
 647   // rax isn't live so capture return address while we easily can
 648   __ movptr(rax, Address(rsp, 0));
 649 
 650   // align stack so push_CPU_state doesn't fault
 651   __ andptr(rsp, -(StackAlignmentInBytes));
 652   __ push_CPU_state();
 653   __ vzeroupper();
 654   // VM needs caller's callsite
 655   // VM needs target method
 656   // This needs to be a long call since we will relocate this adapter to
 657   // the codeBuffer and it may not reach
 658 
 659   // Allocate argument register save area
 660   if (frame::arg_reg_save_area_bytes != 0) {
 661     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 662   }
 663   __ mov(c_rarg0, rbx);
 664   __ mov(c_rarg1, rax);
 665   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 666 
 667   // De-allocate argument register save area
 668   if (frame::arg_reg_save_area_bytes != 0) {
 669     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 670   }
 671 
 672   __ vzeroupper();
 673   __ pop_CPU_state();
 674   // restore sp
 675   __ mov(rsp, r13);
 676   __ bind(L);
 677 }
 678 
 679 
 680 static void gen_c2i_adapter(MacroAssembler *masm,
 681                             int total_args_passed,
 682                             int comp_args_on_stack,
 683                             const BasicType *sig_bt,
 684                             const VMRegPair *regs,
 685                             Label& skip_fixup) {
 686   // Before we get into the guts of the C2I adapter, see if we should be here
 687   // at all.  We've come from compiled code and are attempting to jump to the
 688   // interpreter, which means the caller made a static call to get here
 689   // (vcalls always get a compiled target if there is one).  Check for a
 690   // compiled target.  If there is one, we need to patch the caller's call.
 691   patch_callers_callsite(masm);
 692 
 693   __ bind(skip_fixup);
 694 
 695   // Since all args are passed on the stack, total_args_passed *
 696   // Interpreter::stackElementSize is the space we need.
 697 
 698   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 699 
 700   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 701 
 702   // stack is aligned, keep it that way
 703   // This is not currently needed or enforced by the interpreter, but
 704   // we might as well conform to the ABI.
 705   extraspace = align_up(extraspace, 2*wordSize);
 706 
 707   // set senderSP value
 708   __ lea(r13, Address(rsp, wordSize));
 709 
 710 #ifdef ASSERT
 711   __ check_stack_alignment(r13, "sender stack not aligned");
 712 #endif
 713   if (extraspace > 0) {
 714     // Pop the return address
 715     __ pop(rax);
 716 
 717     __ subptr(rsp, extraspace);
 718 
 719     // Push the return address
 720     __ push(rax);
 721 
 722     // Account for the return address location since we store it first rather
 723     // than hold it in a register across all the shuffling
 724     extraspace += wordSize;
 725   }
 726 
 727 #ifdef ASSERT
 728   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 729 #endif
 730 
 731   // Now write the args into the outgoing interpreter space
 732   for (int i = 0; i < total_args_passed; i++) {
 733     if (sig_bt[i] == T_VOID) {
 734       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 735       continue;
 736     }
 737 
 738     // offset to start parameters
 739     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 740     int next_off = st_off - Interpreter::stackElementSize;
 741 
 742     // Say 4 args:
 743     // i   st_off
 744     // 0   32 T_LONG
 745     // 1   24 T_VOID
 746     // 2   16 T_OBJECT
 747     // 3    8 T_BOOL
 748     // -    0 return address
 749     //
 750     // However, to make things extra confusing: because we can fit a long/double in
 751     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 752     // leaves one slot empty and only stores to a single slot. In this case the
 753     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 754 
 755     VMReg r_1 = regs[i].first();
 756     VMReg r_2 = regs[i].second();
 757     if (!r_1->is_valid()) {
 758       assert(!r_2->is_valid(), "");
 759       continue;
 760     }
 761     if (r_1->is_stack()) {
 762       // memory to memory: use rax as a temporary
 763       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 764       if (!r_2->is_valid()) {
 765         // sign extend??
 766         __ movl(rax, Address(rsp, ld_off));
 767         __ movptr(Address(rsp, st_off), rax);
 768 
 769       } else {
 770 
 771         __ movq(rax, Address(rsp, ld_off));
 772 
 773         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 774         // T_DOUBLE and T_LONG use two slots in the interpreter
 775         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 776           // ld_off == LSW, ld_off+wordSize == MSW
 777           // st_off == MSW, next_off == LSW
 778           __ movq(Address(rsp, next_off), rax);
 779 #ifdef ASSERT
 780           // Overwrite the unused slot with known junk
 781           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 782           __ movptr(Address(rsp, st_off), rax);
 783 #endif /* ASSERT */
 784         } else {
 785           __ movq(Address(rsp, st_off), rax);
 786         }
 787       }
 788     } else if (r_1->is_Register()) {
 789       Register r = r_1->as_Register();
 790       if (!r_2->is_valid()) {
 791         // must be only an int (or smaller), so move only 32 bits to the slot
 792         // why not sign extend??
 793         __ movl(Address(rsp, st_off), r);
 794       } else {
 795         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 796         // T_DOUBLE and T_LONG use two slots in the interpreter
 797         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 798           // long/double in gpr
 799 #ifdef ASSERT
 800           // Overwrite the unused slot with known junk
 801           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 802           __ movptr(Address(rsp, st_off), rax);
 803 #endif /* ASSERT */
 804           __ movq(Address(rsp, next_off), r);
 805         } else {
 806           __ movptr(Address(rsp, st_off), r);
 807         }
 808       }
 809     } else {
 810       assert(r_1->is_XMMRegister(), "");
 811       if (!r_2->is_valid()) {
 812         // only a float; use just part of the slot
 813         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 814       } else {
 815 #ifdef ASSERT
 816         // Overwrite the unused slot with known junk
 817         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 818         __ movptr(Address(rsp, st_off), rax);
 819 #endif /* ASSERT */
 820         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 821       }
 822     }
 823   }
 824 
 825   // Schedule the branch target address early.
 826   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 827   __ jmp(rcx);
 828 }
 829 
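// range_check branches to L_ok when code_start < pc_reg < code_end and falls
// through (to L_fail, bound at its end) otherwise, so gen_i2c_adapter below
// can chain checks against several code ranges and only stop() if none of
// them matched.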
 830 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 831                         address code_start, address code_end,
 832                         Label& L_ok) {
 833   Label L_fail;
 834   __ lea(temp_reg, AddressLiteral(code_start, relocInfo::none));
 835   __ cmpptr(pc_reg, temp_reg);
 836   __ jcc(Assembler::belowEqual, L_fail);
 837   __ lea(temp_reg, AddressLiteral(code_end, relocInfo::none));
 838   __ cmpptr(pc_reg, temp_reg);
 839   __ jcc(Assembler::below, L_ok);
 840   __ bind(L_fail);
 841 }
 842 
 843 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 844                                     int total_args_passed,
 845                                     int comp_args_on_stack,
 846                                     const BasicType *sig_bt,
 847                                     const VMRegPair *regs) {
 848 
 849   // Note: r13 contains the senderSP on entry. We must preserve it since
 850   // we may do an i2c -> c2i transition if we lose a race where compiled
 851   // code goes non-entrant while we get args ready.
 852   // In addition we use r13 to locate all the interpreter args as
 853   // we must align the stack to 16 bytes on an i2c entry else we
 854   // lose alignment we expect in all compiled code and register
 855   // save code can segv when fxsave instructions find improperly
 856   // aligned stack pointer.
 857 
 858   // Adapters can be frameless because they do not require the caller
 859   // to perform additional cleanup work, such as correcting the stack pointer.
 860   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 861   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 862   // even if a callee has modified the stack pointer.
 863   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 864   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 865   // up via the senderSP register).
 866   // In other words, if *either* the caller or callee is interpreted, we can
 867   // get the stack pointer repaired after a call.
 868   // This is why c2i and i2c adapters cannot be indefinitely composed.
 869   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 870   // both caller and callee would be compiled methods, and neither would
 871   // clean up the stack pointer changes performed by the two adapters.
 872   // If this happens, control eventually transfers back to the compiled
 873   // caller, but with an uncorrected stack, causing delayed havoc.
 874 
 875   if (VerifyAdapterCalls &&
 876       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 877     // So, let's test for cascading c2i/i2c adapters right now.
 878     //  assert(Interpreter::contains($return_addr) ||
 879     //         StubRoutines::contains($return_addr),
 880     //         "i2c adapter must return to an interpreter frame");
 881     __ block_comment("verify_i2c { ");
 882     // Pick up the return address
 883     __ movptr(rax, Address(rsp, 0));
 884     Label L_ok;
 885     if (Interpreter::code() != nullptr) {
 886       range_check(masm, rax, r11,
 887                   Interpreter::code()->code_start(),
 888                   Interpreter::code()->code_end(),
 889                   L_ok);
 890     }
 891     if (StubRoutines::initial_stubs_code() != nullptr) {
 892       range_check(masm, rax, r11,
 893                   StubRoutines::initial_stubs_code()->code_begin(),
 894                   StubRoutines::initial_stubs_code()->code_end(),
 895                   L_ok);
 896     }
 897     if (StubRoutines::final_stubs_code() != nullptr) {
 898       range_check(masm, rax, r11,
 899                   StubRoutines::final_stubs_code()->code_begin(),
 900                   StubRoutines::final_stubs_code()->code_end(),
 901                   L_ok);
 902     }
 903     const char* msg = "i2c adapter must return to an interpreter frame";
 904     __ block_comment(msg);
 905     __ stop(msg);
 906     __ bind(L_ok);
 907     __ block_comment("} verify_i2c ");
 908   }
 909 
 910   // Must preserve original SP for loading incoming arguments because
 911   // we need to align the outgoing SP for compiled code.
 912   __ movptr(r11, rsp);
 913 
 914   // Pick up the return address
 915   __ pop(rax);
 916 
 917   // Convert 4-byte c2 stack slots to words.
 918   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 919 
 920   if (comp_args_on_stack) {
 921     __ subptr(rsp, comp_words_on_stack * wordSize);
 922   }
 923 
 924   // Ensure compiled code always sees stack at proper alignment
 925   __ andptr(rsp, -16);
 926 
 927   // Push the return address and misalign the stack so that the youngest frame always sees
 928   // the layout it would have right after a call instruction
 929   __ push(rax);
 930 
 931   // Put saved SP in another register
 932   const Register saved_sp = rax;
 933   __ movptr(saved_sp, r11);
 934 
 935   // Will jump to the compiled code just as if compiled code was doing it.
 936   // Pre-load the register-jump target early, to schedule it better.
 937   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 938 
 939 #if INCLUDE_JVMCI
 940   if (EnableJVMCI) {
 941     // check if this call should be routed towards a specific entry point
 942     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 943     Label no_alternative_target;
 944     __ jcc(Assembler::equal, no_alternative_target);
 945     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 946     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 947     __ bind(no_alternative_target);
 948   }
 949 #endif // INCLUDE_JVMCI
 950 
 951   // Now generate the shuffle code.  Pick up all register args and move the
 952   // rest through the floating point stack top.
 953   for (int i = 0; i < total_args_passed; i++) {
 954     if (sig_bt[i] == T_VOID) {
 955       // Longs and doubles are passed in native word order, but misaligned
 956       // in the 32-bit build.
 957       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 958       continue;
 959     }
 960 
 961     // Pick up 0, 1 or 2 words from SP+offset.
 962 
 963     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 964             "scrambled load targets?");
 965     // Load in argument order going down.
 966     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 967     // Point to interpreter value (vs. tag)
 968     int next_off = ld_off - Interpreter::stackElementSize;
 969     //
 970     //
 971     //
 972     VMReg r_1 = regs[i].first();
 973     VMReg r_2 = regs[i].second();
 974     if (!r_1->is_valid()) {
 975       assert(!r_2->is_valid(), "");
 976       continue;
 977     }
 978     if (r_1->is_stack()) {
 979       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 980       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 981 
 982       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 983       // and if we end up going thru a c2i because of a miss a reasonable value of r13
 984       // will be generated.
 985       if (!r_2->is_valid()) {
 986         // sign extend???
 987         __ movl(r13, Address(saved_sp, ld_off));
 988         __ movptr(Address(rsp, st_off), r13);
 989       } else {
 990         //
 991         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 992         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
 993         // So we must adjust where to pick up the data to match the interpreter.
 994         //
 995         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
 996         // are accessed as negative so LSW is at LOW address
 997 
 998         // ld_off is MSW so get LSW
 999         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
1000                            next_off : ld_off;
1001         __ movq(r13, Address(saved_sp, offset));
1002         // st_off is LSW (i.e. reg.first())
1003         __ movq(Address(rsp, st_off), r13);
1004       }
1005     } else if (r_1->is_Register()) {  // Register argument
1006       Register r = r_1->as_Register();
1007       assert(r != rax, "must be different");
1008       if (r_2->is_valid()) {
1009         //
1010         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
1011         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
1012         // So we must adjust where to pick up the data to match the interpreter.
1013 
1014         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
1015                            next_off : ld_off;
1016 
1017         // this can be a misaligned move
1018         __ movq(r, Address(saved_sp, offset));
1019       } else {
1020         // sign extend and use a full word?
1021         __ movl(r, Address(saved_sp, ld_off));
1022       }
1023     } else {
1024       if (!r_2->is_valid()) {
1025         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1026       } else {
1027         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1028       }
1029     }
1030   }
1031 
1032   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1033 
1034   // 6243940 We might end up in handle_wrong_method if
1035   // the callee is deoptimized as we race thru here. If that
1036   // happens we don't want to take a safepoint because the
1037   // caller frame will look interpreted and arguments are now
1038   // "compiled" so it is much better to make this transition
1039   // invisible to the stack walking code. Unfortunately if
1040   // we try and find the callee by normal means a safepoint
1041   // is possible. So we stash the desired callee in the thread
1042   // and the VM will find it there should this case occur.
1043 
1044   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1045 
1046   // Put the Method* where a c2i adapter would expect it should we end up there;
1047   // only needed because c2 resolve stubs return the Method* as a result in
1048   // rax
1049   __ mov(rax, rbx);
1050   __ jmp(r11);
1051 }
1052 
1053 // ---------------------------------------------------------------
1054 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1055                                                             int total_args_passed,
1056                                                             int comp_args_on_stack,
1057                                                             const BasicType *sig_bt,
1058                                                             const VMRegPair *regs,
1059                                                             AdapterFingerPrint* fingerprint) {
1060   address i2c_entry = __ pc();
1061 
1062   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1063 
1064   // -------------------------------------------------------------------------
1065   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1066   // to the interpreter.  The args start out packed in the compiled layout.  They
1067   // need to be unpacked into the interpreter layout.  This will almost always
1068   // require some stack space.  We grow the current (compiled) stack, then repack
1069   // the args.  We  finally end in a jump to the generic interpreter entry point.
1070   // On exit from the interpreter, the interpreter will restore our SP (lest the
1071   // compiled code, which relies solely on SP and not RBP, get sick).
1072 
1073   address c2i_unverified_entry = __ pc();
1074   Label skip_fixup;
1075 
1076   Register data = rax;
1077   Register receiver = j_rarg0;
1078   Register temp = rbx;
1079 
1080   {
1081     __ ic_check(1 /* end_alignment */);
1082     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1083     // Method might have been compiled since the call site was patched to
1084     // interpreted; if that is the case, treat it as a miss so we can get
1085     // the call site corrected.
1086     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1087     __ jcc(Assembler::equal, skip_fixup);
1088     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1089   }
1090 
1091   address c2i_entry = __ pc();
1092 
1093   // Class initialization barrier for static methods
1094   address c2i_no_clinit_check_entry = nullptr;
1095   if (VM_Version::supports_fast_class_init_checks()) {
1096     Label L_skip_barrier;
1097     Register method = rbx;
1098 
1099     { // Bypass the barrier for non-static methods
1100       Register flags = rscratch1;
1101       __ movl(flags, Address(method, Method::access_flags_offset()));
1102       __ testl(flags, JVM_ACC_STATIC);
1103       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1104     }
1105 
1106     Register klass = rscratch1;
1107     __ load_method_holder(klass, method);
1108     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1109 
1110     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1111 
1112     __ bind(L_skip_barrier);
1113     c2i_no_clinit_check_entry = __ pc();
1114   }
1115 
1116   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1117   bs->c2i_entry_barrier(masm);
1118 
1119   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1120 
1121   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1122 }
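
// The AdapterHandlerEntry created above bundles the generated entry points:
// i2c_entry (compiled caller -> interpreted callee), c2i_unverified_entry
// (c2i preceded by the inline cache check), c2i_entry (c2i after the check),
// and, when fast class-init checks are supported, c2i_no_clinit_check_entry,
// which skips the class initialization barrier.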
1123 
1124 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1125                                          VMRegPair *regs,
1126                                          int total_args_passed) {
1127 
1128 // We return the number of VMRegImpl stack slots we need to reserve for all
1129 // the arguments, NOT counting out_preserve_stack_slots.
1130 
1131 // NOTE: These arrays will have to change when c1 is ported
1132 #ifdef _WIN64
1133     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1134       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1135     };
1136     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1137       c_farg0, c_farg1, c_farg2, c_farg3
1138     };
1139 #else
1140     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1141       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1142     };
1143     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1144       c_farg0, c_farg1, c_farg2, c_farg3,
1145       c_farg4, c_farg5, c_farg6, c_farg7
1146     };
1147 #endif // _WIN64
1148 
1149 
1150     uint int_args = 0;
1151     uint fp_args = 0;
1152     uint stk_args = 0; // inc by 2 each time
1153 
1154     for (int i = 0; i < total_args_passed; i++) {
1155       switch (sig_bt[i]) {
1156       case T_BOOLEAN:
1157       case T_CHAR:
1158       case T_BYTE:
1159       case T_SHORT:
1160       case T_INT:
1161         if (int_args < Argument::n_int_register_parameters_c) {
1162           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1163 #ifdef _WIN64
1164           fp_args++;
1165           // Allocate slots for the callee to stuff register args on the stack.
1166           stk_args += 2;
1167 #endif
1168         } else {
1169           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1170           stk_args += 2;
1171         }
1172         break;
1173       case T_LONG:
1174         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1175         // fall through
1176       case T_OBJECT:
1177       case T_ARRAY:
1178       case T_ADDRESS:
1179       case T_METADATA:
1180         if (int_args < Argument::n_int_register_parameters_c) {
1181           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1182 #ifdef _WIN64
1183           fp_args++;
1184           stk_args += 2;
1185 #endif
1186         } else {
1187           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1188           stk_args += 2;
1189         }
1190         break;
1191       case T_FLOAT:
1192         if (fp_args < Argument::n_float_register_parameters_c) {
1193           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1194 #ifdef _WIN64
1195           int_args++;
1196           // Allocate slots for the callee to stuff register args on the stack.
1197           stk_args += 2;
1198 #endif
1199         } else {
1200           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1201           stk_args += 2;
1202         }
1203         break;
1204       case T_DOUBLE:
1205         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1206         if (fp_args < Argument::n_float_register_parameters_c) {
1207           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1208 #ifdef _WIN64
1209           int_args++;
1210           // Allocate slots for the callee to stuff register args on the stack.
1211           stk_args += 2;
1212 #endif
1213         } else {
1214           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1215           stk_args += 2;
1216         }
1217         break;
1218       case T_VOID: // Halves of longs and doubles
1219         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1220         regs[i].set_bad();
1221         break;
1222       default:
1223         ShouldNotReachHere();
1224         break;
1225       }
1226     }
1227 #ifdef _WIN64
1228   // The Windows ABI requires that we always allocate enough stack space
1229   // for 4 64-bit registers to be stored down.
1230   if (stk_args < 8) {
1231     stk_args = 8;
1232   }
1233 #endif // _WIN64
1234 
1235   return stk_args;
1236 }
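
// Illustrative example: for sig_bt = { T_INT, T_DOUBLE, T_VOID, T_OBJECT } the
// loop above yields c_rarg0, c_farg0, -, c_rarg1 and returns 0 stack slots on
// non-Windows targets. On Windows, integer and FP arguments consume positions
// from a single sequence (giving c_rarg0, c_farg1, -, c_rarg2) and at least 8
// slots are reported for the 32-byte register home area.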
1237 
1238 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1239                                              uint num_bits,
1240                                              uint total_args_passed) {
1241   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1242          "only certain vector sizes are supported for now");
1243 
1244   static const XMMRegister VEC_ArgReg[32] = {
1245      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1246      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1247     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1248     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1249   };
1250 
1251   uint stk_args = 0;
1252   uint fp_args = 0;
1253 
1254   for (uint i = 0; i < total_args_passed; i++) {
1255     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1256     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1257     regs[i].set_pair(vmreg->next(next_val), vmreg);
1258   }
1259 
1260   return stk_args;
1261 }
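
// For example, with num_bits == 256 each argument i is assigned the VMReg
// range [xmm<i>, xmm<i>->next(7)], i.e. eight 32-bit slots describing one YMM
// register, and no stack slots are used (the function returns 0).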
1262 
1263 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1264   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1265   // which by this time is free to use
1266   switch (ret_type) {
1267   case T_FLOAT:
1268     __ movflt(Address(rbp, -wordSize), xmm0);
1269     break;
1270   case T_DOUBLE:
1271     __ movdbl(Address(rbp, -wordSize), xmm0);
1272     break;
1273   case T_VOID:  break;
1274   default: {
1275     __ movptr(Address(rbp, -wordSize), rax);
1276     }
1277   }
1278 }
1279 
1280 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1281   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1282   // which by this time is free to use
1283   switch (ret_type) {
1284   case T_FLOAT:
1285     __ movflt(xmm0, Address(rbp, -wordSize));
1286     break;
1287   case T_DOUBLE:
1288     __ movdbl(xmm0, Address(rbp, -wordSize));
1289     break;
1290   case T_VOID:  break;
1291   default: {
1292     __ movptr(rax, Address(rbp, -wordSize));
1293     }
1294   }
1295 }
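
// A typical pairing (sketch): a native wrapper spills the result around a VM
// call that might clobber it, e.g.
//
//   save_native_result(masm, ret_type, stack_slots);
//   // ... call into the runtime ...
//   restore_native_result(masm, ret_type, stack_slots);
//
// As noted above, the frame_slots argument is ignored here; the value lives in
// the word just below rbp either way.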
1296 
1297 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1298     for ( int i = first_arg ; i < arg_count ; i++ ) {
1299       if (args[i].first()->is_Register()) {
1300         __ push(args[i].first()->as_Register());
1301       } else if (args[i].first()->is_XMMRegister()) {
1302         __ subptr(rsp, 2*wordSize);
1303         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1304       }
1305     }
1306 }
1307 
1308 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1309     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1310       if (args[i].first()->is_Register()) {
1311         __ pop(args[i].first()->as_Register());
1312       } else if (args[i].first()->is_XMMRegister()) {
1313         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1314         __ addptr(rsp, 2*wordSize);
1315       }
1316     }
1317 }
1318 
1319 static void verify_oop_args(MacroAssembler* masm,
1320                             const methodHandle& method,
1321                             const BasicType* sig_bt,
1322                             const VMRegPair* regs) {
1323   Register temp_reg = rbx;  // not part of any compiled calling seq
1324   if (VerifyOops) {
1325     for (int i = 0; i < method->size_of_parameters(); i++) {
1326       if (is_reference_type(sig_bt[i])) {
1327         VMReg r = regs[i].first();
1328         assert(r->is_valid(), "bad oop arg");
1329         if (r->is_stack()) {
1330           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1331           __ verify_oop(temp_reg);
1332         } else {
1333           __ verify_oop(r->as_Register());
1334         }
1335       }
1336     }
1337   }
1338 }
1339 
1340 static void check_continuation_enter_argument(VMReg actual_vmreg,
1341                                               Register expected_reg,
1342                                               const char* name) {
1343   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1344   assert(actual_vmreg->as_Register() == expected_reg,
1345          "%s is in unexpected register: %s instead of %s",
1346          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1347 }
1348 
1349 
1350 //---------------------------- continuation_enter_setup ---------------------------
1351 //
1352 // Arguments:
1353 //   None.
1354 //
1355 // Results:
1356 //   rsp: pointer to blank ContinuationEntry
1357 //
1358 // Kills:
1359 //   rax
1360 //
1361 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1362   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1363   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1364   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1365 
1366   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1367   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1368 
1369   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1370   OopMap* map = new OopMap(frame_size, 0);
1371 
1372   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1373   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1374   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
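  // The thread's cont_entry now points at the new (still blank) entry, whose
  // parent field holds the previous value of cont_entry.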
1375 
1376   return map;
1377 }
1378 
1379 //---------------------------- fill_continuation_entry ---------------------------
1380 //
1381 // Arguments:
//   rsp: pointer to blank ContinuationEntry
1383 //   reg_cont_obj: pointer to the continuation
1384 //   reg_flags: flags
1385 //
1386 // Results:
1387 //   rsp: pointer to filled out ContinuationEntry
1388 //
1389 // Kills:
1390 //   rax
1391 //
1392 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1393   assert_different_registers(rax, reg_cont_obj, reg_flags);
1394 #ifdef ASSERT
1395   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1396 #endif
1397   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1398   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1399   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1400   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1401   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
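  // The entry's own fields are initialized; the code below additionally saves the
  // carrier thread's cont fastpath and held monitor count into the entry and then
  // clears them on the thread.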
1402 
1403   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1404   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1405   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1406   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1407 
1408   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1409   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1410 }
1411 
1412 //---------------------------- continuation_enter_cleanup ---------------------------
1413 //
1414 // Arguments:
1415 //   rsp: pointer to the ContinuationEntry
1416 //
1417 // Results:
1418 //   rsp: pointer to the spilled rbp in the entry frame
1419 //
1420 // Kills:
1421 //   rbx
1422 //
static void continuation_enter_cleanup(MacroAssembler* masm) {
1424 #ifdef ASSERT
1425   Label L_good_sp;
1426   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1427   __ jcc(Assembler::equal, L_good_sp);
1428   __ stop("Incorrect rsp at continuation_enter_cleanup");
1429   __ bind(L_good_sp);
1430 #endif
1431   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1432   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1433 
1434   if (CheckJNICalls) {
1435     // Check if this is a virtual thread continuation
1436     Label L_skip_vthread_code;
1437     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1438     __ jcc(Assembler::equal, L_skip_vthread_code);
1439 
    // If the JNI monitor count is > 0 and this vthread is terminating then
    // it failed to release a JNI monitor. So we issue the same log message
    // that JavaThread::exit does.
1443     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1444     __ jcc(Assembler::equal, L_skip_vthread_code);
1445 
1446     // rax may hold an exception oop, save it before the call
1447     __ push(rax);
1448     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1449     __ pop(rax);
1450 
1451     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1452     // on termination. The held count is implicitly zeroed below when we restore from
1453     // the parent held count (which has to be zero).
1454     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1455 
1456     __ bind(L_skip_vthread_code);
1457   }
1458 #ifdef ASSERT
1459   else {
1460     // Check if this is a virtual thread continuation
1461     Label L_skip_vthread_code;
1462     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1463     __ jcc(Assembler::equal, L_skip_vthread_code);
1464 
    // See comment just above. If not checking JNI calls, the JNI count is only
    // needed for assertion checking.
1467     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1468 
1469     __ bind(L_skip_vthread_code);
1470   }
1471 #endif
1472 
1473   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1474   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1475 
1476   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1477   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1478   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1479 }
1480 
1481 static void gen_continuation_enter(MacroAssembler* masm,
1482                                    const VMRegPair* regs,
1483                                    int& exception_offset,
1484                                    OopMapSet* oop_maps,
1485                                    int& frame_complete,
1486                                    int& stack_slots,
1487                                    int& interpreted_entry_offset,
1488                                    int& compiled_entry_offset) {
1489 
1490   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1491   int pos_cont_obj   = 0;
1492   int pos_is_cont    = 1;
1493   int pos_is_virtual = 2;
1494 
  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside in these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
1499   Register reg_cont_obj   = c_rarg1;
1500   Register reg_is_cont    = c_rarg2;
1501   Register reg_is_virtual = c_rarg3;
1502 
1503   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1504   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1505   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1506 
1507   // Utility methods kill rax, make sure there are no collisions
1508   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1509 
1510   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1511                          relocInfo::static_call_type);
1512 
1513   address start = __ pc();
1514 
1515   Label L_thaw, L_exit;
1516 
  // i2i entry used only in interp_only_mode
1518   interpreted_entry_offset = __ pc() - start;
1519   {
1520 #ifdef ASSERT
1521     Label is_interp_only;
1522     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1523     __ jcc(Assembler::notEqual, is_interp_only);
1524     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1525     __ bind(is_interp_only);
1526 #endif
1527 
1528     __ pop(rax); // return address
1529     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1530     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1531     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1532     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
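    // The interpreter pushed the arguments left to right, so the first one (the
    // Continuation oop) sits in the highest of the three stack elements and the
    // last one (isVirtualThread) in the lowest.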
1533     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1534     __ push(rax); // return address
1535     __ push_cont_fastpath();
1536 
1537     __ enter();
1538 
1539     stack_slots = 2; // will be adjusted in setup
1540     OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame
    // would appear unsafe. That is okay: at the very worst we miss an async sample, and we
    // are in interp_only_mode anyway.
1543 
1544     __ verify_oop(reg_cont_obj);
1545 
1546     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1547 
1548     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1549     __ testptr(reg_is_cont, reg_is_cont);
1550     __ jcc(Assembler::notZero, L_thaw);
1551 
1552     // --- Resolve path
1553 
1554     // Make sure the call is patchable
1555     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1556     // Emit stub for static call
1557     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1558     if (stub == nullptr) {
1559       fatal("CodeCache is full at gen_continuation_enter");
1560     }
1561     __ call(resolve);
1562     oop_maps->add_gc_map(__ pc() - start, map);
1563     __ post_call_nop();
1564 
1565     __ jmp(L_exit);
1566   }
1567 
1568   // compiled entry
1569   __ align(CodeEntryAlignment);
1570   compiled_entry_offset = __ pc() - start;
1571   __ enter();
1572 
1573   stack_slots = 2; // will be adjusted in setup
1574   OopMap* map = continuation_enter_setup(masm, stack_slots);
1575 
1576   // Frame is now completed as far as size and linkage.
1577   frame_complete = __ pc() - start;
1578 
1579   __ verify_oop(reg_cont_obj);
1580 
1581   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1582 
1583   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1584   __ testptr(reg_is_cont, reg_is_cont);
1585   __ jccb(Assembler::notZero, L_thaw);
1586 
1587   // --- call Continuation.enter(Continuation c, boolean isContinue)
1588 
1589   // Make sure the call is patchable
1590   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1591 
1592   // Emit stub for static call
1593   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1594   if (stub == nullptr) {
1595     fatal("CodeCache is full at gen_continuation_enter");
1596   }
1597 
1598   // The call needs to be resolved. There's a special case for this in
1599   // SharedRuntime::find_callee_info_helper() which calls
1600   // LinkResolver::resolve_continuation_enter() which resolves the call to
1601   // Continuation.enter(Continuation c, boolean isContinue).
1602   __ call(resolve);
1603 
1604   oop_maps->add_gc_map(__ pc() - start, map);
1605   __ post_call_nop();
1606 
1607   __ jmpb(L_exit);
1608 
1609   // --- Thawing path
1610 
1611   __ bind(L_thaw);
1612 
1613   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1614 
1615   ContinuationEntry::_return_pc_offset = __ pc() - start;
1616   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1617   __ post_call_nop();
1618 
1619   // --- Normal exit (resolve/thawing)
1620 
1621   __ bind(L_exit);
1622 
1623   continuation_enter_cleanup(masm);
1624   __ pop(rbp);
1625   __ ret(0);
1626 
1627   // --- Exception handling path
1628 
1629   exception_offset = __ pc() - start;
1630 
1631   continuation_enter_cleanup(masm);
1632   __ pop(rbp);
1633 
1634   __ movptr(c_rarg0, r15_thread);
1635   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1636 
1637   // rax still holds the original exception oop, save it before the call
1638   __ push(rax);
1639 
1640   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1641   __ movptr(rbx, rax);
1642 
1643   // Continue at exception handler:
1644   //   rax: exception oop
1645   //   rbx: exception handler
1646   //   rdx: exception pc
1647   __ pop(rax);
1648   __ verify_oop(rax);
1649   __ pop(rdx);
1650   __ jmp(rbx);
1651 }
1652 
1653 static void gen_continuation_yield(MacroAssembler* masm,
1654                                    const VMRegPair* regs,
1655                                    OopMapSet* oop_maps,
1656                                    int& frame_complete,
1657                                    int& stack_slots,
1658                                    int& compiled_entry_offset) {
1659   enum layout {
1660     rbp_off,
1661     rbpH_off,
1662     return_off,
1663     return_off2,
1664     framesize // inclusive of return address
1665   };
1666   stack_slots = framesize /  VMRegImpl::slots_per_word;
1667   assert(stack_slots == 2, "recheck layout");
1668 
1669   address start = __ pc();
1670   compiled_entry_offset = __ pc() - start;
1671   __ enter();
1672   address the_pc = __ pc();
1673 
1674   frame_complete = the_pc - start;
1675 
  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, so associate the OopMap
  // with it right away.
1679   __ post_call_nop();
1680   OopMap* map = new OopMap(framesize, 1);
1681   oop_maps->add_gc_map(frame_complete, map);
1682 
1683   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1684   __ movptr(c_rarg0, r15_thread);
1685   __ movptr(c_rarg1, rsp);
1686   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1687   __ reset_last_Java_frame(true);
1688 
1689   Label L_pinned;
1690 
1691   __ testptr(rax, rax);
1692   __ jcc(Assembler::notZero, L_pinned);
1693 
1694   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1695   continuation_enter_cleanup(masm);
1696   __ pop(rbp);
1697   __ ret(0);
1698 
1699   __ bind(L_pinned);
1700 
1701   // Pinned, return to caller
1702 
1703   // handle pending exception thrown by freeze
1704   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1705   Label ok;
1706   __ jcc(Assembler::equal, ok);
1707   __ leave();
1708   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1709   __ bind(ok);
1710 
1711   __ leave();
1712   __ ret(0);
1713 }
1714 
1715 static void gen_special_dispatch(MacroAssembler* masm,
1716                                  const methodHandle& method,
1717                                  const BasicType* sig_bt,
1718                                  const VMRegPair* regs) {
1719   verify_oop_args(masm, method, sig_bt, regs);
1720   vmIntrinsics::ID iid = method->intrinsic_id();
1721 
1722   // Now write the args into the outgoing interpreter space
1723   bool     has_receiver   = false;
1724   Register receiver_reg   = noreg;
1725   int      member_arg_pos = -1;
1726   Register member_reg     = noreg;
1727   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1728   if (ref_kind != 0) {
1729     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1730     member_reg = rbx;  // known to be free at this point
1731     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1732   } else if (iid == vmIntrinsics::_invokeBasic) {
1733     has_receiver = true;
1734   } else if (iid == vmIntrinsics::_linkToNative) {
1735     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1736     member_reg = rbx;  // known to be free at this point
1737   } else {
1738     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1739   }
1740 
1741   if (member_reg != noreg) {
1742     // Load the member_arg into register, if necessary.
1743     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1744     VMReg r = regs[member_arg_pos].first();
1745     if (r->is_stack()) {
1746       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1747     } else {
1748       // no data motion is needed
1749       member_reg = r->as_Register();
1750     }
1751   }
1752 
1753   if (has_receiver) {
1754     // Make sure the receiver is loaded into a register.
1755     assert(method->size_of_parameters() > 0, "oob");
1756     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1757     VMReg r = regs[0].first();
1758     assert(r->is_valid(), "bad receiver arg");
1759     if (r->is_stack()) {
1760       // Porting note:  This assumes that compiled calling conventions always
1761       // pass the receiver oop in a register.  If this is not true on some
1762       // platform, pick a temp and load the receiver from stack.
1763       fatal("receiver always in a register");
1764       receiver_reg = j_rarg0;  // known to be free at this point
1765       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1766     } else {
1767       // no data motion is needed
1768       receiver_reg = r->as_Register();
1769     }
1770   }
1771 
1772   // Figure out which address we are really jumping to:
1773   MethodHandles::generate_method_handle_dispatch(masm, iid,
1774                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1775 }
1776 
1777 // ---------------------------------------------------------------------------
1778 // Generate a native wrapper for a given method.  The method takes arguments
1779 // in the Java compiled code convention, marshals them to the native
1780 // convention (handlizes oops, etc), transitions to native, makes the call,
1781 // returns to java state (possibly blocking), unhandlizes any result and
1782 // returns.
1783 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1792 //
1793 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1794                                                 const methodHandle& method,
1795                                                 int compile_id,
1796                                                 BasicType* in_sig_bt,
1797                                                 VMRegPair* in_regs,
1798                                                 BasicType ret_type) {
1799   if (method->is_continuation_native_intrinsic()) {
1800     int exception_offset = -1;
1801     OopMapSet* oop_maps = new OopMapSet();
1802     int frame_complete = -1;
1803     int stack_slots = -1;
1804     int interpreted_entry_offset = -1;
1805     int vep_offset = -1;
1806     if (method->is_continuation_enter_intrinsic()) {
1807       gen_continuation_enter(masm,
1808                              in_regs,
1809                              exception_offset,
1810                              oop_maps,
1811                              frame_complete,
1812                              stack_slots,
1813                              interpreted_entry_offset,
1814                              vep_offset);
1815     } else if (method->is_continuation_yield_intrinsic()) {
1816       gen_continuation_yield(masm,
1817                              in_regs,
1818                              oop_maps,
1819                              frame_complete,
1820                              stack_slots,
1821                              vep_offset);
1822     } else {
1823       guarantee(false, "Unknown Continuation native intrinsic");
1824     }
1825 
1826 #ifdef ASSERT
1827     if (method->is_continuation_enter_intrinsic()) {
1828       assert(interpreted_entry_offset != -1, "Must be set");
1829       assert(exception_offset != -1,         "Must be set");
1830     } else {
1831       assert(interpreted_entry_offset == -1, "Must be unset");
1832       assert(exception_offset == -1,         "Must be unset");
1833     }
1834     assert(frame_complete != -1,    "Must be set");
1835     assert(stack_slots != -1,       "Must be set");
1836     assert(vep_offset != -1,        "Must be set");
1837 #endif
1838 
1839     __ flush();
1840     nmethod* nm = nmethod::new_native_nmethod(method,
1841                                               compile_id,
1842                                               masm->code(),
1843                                               vep_offset,
1844                                               frame_complete,
1845                                               stack_slots,
1846                                               in_ByteSize(-1),
1847                                               in_ByteSize(-1),
1848                                               oop_maps,
1849                                               exception_offset);
1850     if (nm == nullptr) return nm;
1851     if (method->is_continuation_enter_intrinsic()) {
1852       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1853     } else if (method->is_continuation_yield_intrinsic()) {
1854       _cont_doYield_stub = nm;
1855     }
1856     return nm;
1857   }
1858 
1859   if (method->is_method_handle_intrinsic()) {
1860     vmIntrinsics::ID iid = method->intrinsic_id();
1861     intptr_t start = (intptr_t)__ pc();
1862     int vep_offset = ((intptr_t)__ pc()) - start;
1863     gen_special_dispatch(masm,
1864                          method,
1865                          in_sig_bt,
1866                          in_regs);
1867     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1868     __ flush();
1869     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1870     return nmethod::new_native_nmethod(method,
1871                                        compile_id,
1872                                        masm->code(),
1873                                        vep_offset,
1874                                        frame_complete,
1875                                        stack_slots / VMRegImpl::slots_per_word,
1876                                        in_ByteSize(-1),
1877                                        in_ByteSize(-1),
1878                                        nullptr);
1879   }
1880   address native_func = method->native_function();
1881   assert(native_func != nullptr, "must have function");
1882 
1883   // An OopMap for lock (and class if static)
1884   OopMapSet *oop_maps = new OopMapSet();
1885   intptr_t start = (intptr_t)__ pc();
1886 
  // We have received a description of where all the java args are located
1888   // on entry to the wrapper. We need to convert these args to where
1889   // the jni function will expect them. To figure out where they go
1890   // we convert the java signature to a C signature by inserting
1891   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1892 
1893   const int total_in_args = method->size_of_parameters();
1894   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1895 
1896   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1897   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1898   BasicType* in_elem_bt = nullptr;
1899 
1900   int argc = 0;
1901   out_sig_bt[argc++] = T_ADDRESS;
1902   if (method->is_static()) {
1903     out_sig_bt[argc++] = T_OBJECT;
1904   }
1905 
1906   for (int i = 0; i < total_in_args ; i++ ) {
1907     out_sig_bt[argc++] = in_sig_bt[i];
1908   }
1909 
1910   // Now figure out where the args must be stored and how much stack space
1911   // they require.
1912   int out_arg_slots;
1913   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1914 
1915   // Compute framesize for the wrapper.  We need to handlize all oops in
1916   // incoming registers
1917 
1918   // Calculate the total number of stack slots we will need.
1919 
1920   // First count the abi requirement plus all of the outgoing args
1921   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1922 
1923   // Now the space for the inbound oop handle area
1924   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1925 
1926   int oop_handle_offset = stack_slots;
1927   stack_slots += total_save_slots;
1928 
1929   // Now any space we need for handlizing a klass if static method
1930 
1931   int klass_slot_offset = 0;
1932   int klass_offset = -1;
1933   int lock_slot_offset = 0;
1934   bool is_static = false;
1935 
1936   if (method->is_static()) {
1937     klass_slot_offset = stack_slots;
1938     stack_slots += VMRegImpl::slots_per_word;
1939     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1940     is_static = true;
1941   }
1942 
1943   // Plus a lock if needed
1944 
1945   if (method->is_synchronized()) {
1946     lock_slot_offset = stack_slots;
1947     stack_slots += VMRegImpl::slots_per_word;
1948   }
1949 
1950   // Now a place (+2) to save return values or temp during shuffling
1951   // + 4 for return address (which we own) and saved rbp
1952   stack_slots += 6;
1953 
1954   // Ok The space we have allocated will look like:
1955   //
1956   //
1957   // FP-> |                     |
1958   //      |---------------------|
1959   //      | 2 slots for moves   |
1960   //      |---------------------|
1961   //      | lock box (if sync)  |
1962   //      |---------------------| <- lock_slot_offset
1963   //      | klass (if static)   |
1964   //      |---------------------| <- klass_slot_offset
1965   //      | oopHandle area      |
1966   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1967   //      | outbound memory     |
1968   //      | based arguments     |
1969   //      |                     |
1970   //      |---------------------|
1971   //      |                     |
1972   // SP-> | out_preserved_slots |
1973   //
1974   //
1975 
1976 
1977   // Now compute actual number of stack words we need rounding to make
1978   // stack properly aligned.
1979   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1980 
1981   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1982 
1983   // First thing make an ic check to see if we should even be here
1984 
1985   // We are free to use all registers as temps without saving them and
1986   // restoring them except rbp. rbp is the only callee save register
1987   // as far as the interpreter and the compiler(s) are concerned.
1988 
1989   const Register receiver = j_rarg0;
1990 
1991   Label exception_pending;
1992 
1993   assert_different_registers(receiver, rscratch1, rscratch2);
1994   __ verify_oop(receiver);
1995   __ ic_check(8 /* end_alignment */);
1996 
1997   int vep_offset = ((intptr_t)__ pc()) - start;
1998 
1999   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2000     Label L_skip_barrier;
2001     Register klass = r10;
2002     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2003     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2004 
2005     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2006 
2007     __ bind(L_skip_barrier);
2008   }
2009 
2010 #ifdef COMPILER1
2011   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2012   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2013     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2014   }
2015 #endif // COMPILER1
2016 
2017   // The instruction at the verified entry point must be 5 bytes or longer
2018   // because it can be patched on the fly by make_non_entrant. The stack bang
2019   // instruction fits that requirement.
2020 
2021   // Generate stack overflow check
2022   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2023 
2024   // Generate a new frame for the wrapper.
2025   __ enter();
2026   // -2 because return address is already present and so is saved rbp
2027   __ subptr(rsp, stack_size - 2*wordSize);
2028 
2029   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2030   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2031   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2032 
2033   // Frame is now completed as far as size and linkage.
2034   int frame_complete = ((intptr_t)__ pc()) - start;
2035 
2036 #ifdef ASSERT
2037   __ check_stack_alignment(rsp, "improperly aligned stack");
2038 #endif /* ASSERT */
2039 
2040 
2041   // We use r14 as the oop handle for the receiver/klass
2042   // It is callee save so it survives the call to native
2043 
2044   const Register oop_handle_reg = r14;
2045 
2046   //
  // We immediately shuffle the arguments so that, for any vm call we have to
  // make from here on out (sync slow path, jvmti, etc.), we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.
2051 
2052   // -----------------
2053   // The Grand Shuffle
2054 
  // The Java calling convention is either the same as (linux) or denser than (win64) the
  // C calling convention. However, because of the jni_env argument the C calling
  // convention always has at least one more (and two for static) arguments than Java.
  // Therefore if we move the args from java -> c backwards then we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.
2061   //
2062 
2063   // Record esp-based slot for receiver on stack for non-static methods
2064   int receiver_offset = -1;
2065 
  // This is a trick. We double the stack slots so we can claim
  // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
  // sure we can capture all the incoming oop args from the
  // caller.
2071   //
2072   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2073 
2074   // Mark location of rbp (someday)
2075   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2076 
2077   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2078   // All inbound args are referenced based on rbp and all outbound args via rsp.
2079 
2080 
2081 #ifdef ASSERT
2082   bool reg_destroyed[Register::number_of_registers];
2083   bool freg_destroyed[XMMRegister::number_of_registers];
2084   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2085     reg_destroyed[r] = false;
2086   }
2087   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2088     freg_destroyed[f] = false;
2089   }
2090 
2091 #endif /* ASSERT */
2092 
2093   // For JNI natives the incoming and outgoing registers are offset upwards.
2094   GrowableArray<int> arg_order(2 * total_in_args);
2095 
2096   VMRegPair tmp_vmreg;
2097   tmp_vmreg.set2(rbx->as_VMReg());
2098 
2099   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2100     arg_order.push(i);
2101     arg_order.push(c_arg);
2102   }
2103 
2104   int temploc = -1;
2105   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2106     int i = arg_order.at(ai);
2107     int c_arg = arg_order.at(ai + 1);
2108     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2109 #ifdef ASSERT
2110     if (in_regs[i].first()->is_Register()) {
2111       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2112     } else if (in_regs[i].first()->is_XMMRegister()) {
2113       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2114     }
2115     if (out_regs[c_arg].first()->is_Register()) {
2116       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2117     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2118       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2119     }
2120 #endif /* ASSERT */
2121     switch (in_sig_bt[i]) {
2122       case T_ARRAY:
2123       case T_OBJECT:
2124         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2125                     ((i == 0) && (!is_static)),
2126                     &receiver_offset);
2127         break;
2128       case T_VOID:
2129         break;
2130 
2131       case T_FLOAT:
2132         __ float_move(in_regs[i], out_regs[c_arg]);
        break;
2134 
2135       case T_DOUBLE:
2136         assert( i + 1 < total_in_args &&
2137                 in_sig_bt[i + 1] == T_VOID &&
2138                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2139         __ double_move(in_regs[i], out_regs[c_arg]);
2140         break;
2141 
2142       case T_LONG :
2143         __ long_move(in_regs[i], out_regs[c_arg]);
2144         break;
2145 
2146       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
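        // No break: the assert is compiled out in product builds and control would fall
        // through to the default move below, but T_ADDRESS never appears in Java args.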
2147 
2148       default:
2149         __ move32_64(in_regs[i], out_regs[c_arg]);
2150     }
2151   }
2152 
2153   int c_arg;
2154 
2155   // Pre-load a static method's oop into r14.  Used both by locking code and
2156   // the normal JNI call code.
2157   // point c_arg at the first arg that is already loaded in case we
2158   // need to spill before we call out
2159   c_arg = total_c_args - total_in_args;
2160 
2161   if (method->is_static()) {
2162 
2163     //  load oop into a register
2164     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2165 
    // Now handlize the static class mirror; it's known to be not-null.
2167     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2168     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2169 
2170     // Now get the handle
2171     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2172     // store the klass handle as second argument
2173     __ movptr(c_rarg1, oop_handle_reg);
2174     // and protect the arg if we must spill
2175     c_arg--;
2176   }
2177 
2178   // Change state to native (we save the return address in the thread, since it might not
2179   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2180   // points into the right code segment. It does not have to be the correct return pc.
2181   // We use the same pc/oopMap repeatedly when we call out
2182 
2183   intptr_t the_pc = (intptr_t) __ pc();
2184   oop_maps->add_gc_map(the_pc - start, map);
2185 
2186   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2187 
2188 
  // We have all of the arguments set up at this point. We must not touch any
  // argument registers from here on (if we had to save/restore them, there would be
  // no oop map describing those saves).
2191 
2192   if (DTraceMethodProbes) {
2193     // protect the args we've loaded
2194     save_args(masm, total_c_args, c_arg, out_regs);
2195     __ mov_metadata(c_rarg1, method());
2196     __ call_VM_leaf(
2197       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2198       r15_thread, c_rarg1);
2199     restore_args(masm, total_c_args, c_arg, out_regs);
2200   }
2201 
2202   // RedefineClasses() tracing support for obsolete method entry
2203   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2204     // protect the args we've loaded
2205     save_args(masm, total_c_args, c_arg, out_regs);
2206     __ mov_metadata(c_rarg1, method());
2207     __ call_VM_leaf(
2208       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2209       r15_thread, c_rarg1);
2210     restore_args(masm, total_c_args, c_arg, out_regs);
2211   }
2212 
2213   // Lock a synchronized method
2214 
2215   // Register definitions used by locking and unlocking
2216 
2217   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2218   const Register obj_reg  = rbx;  // Will contain the oop
2219   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2220   const Register old_hdr  = r13;  // value of old header at unlock time
2221 
2222   Label slow_path_lock;
2223   Label lock_done;
2224 
2225   if (method->is_synchronized()) {
2226     Label count_mon;
2227 
2228     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2229 
2230     // Get the handle (the 2nd argument)
2231     __ mov(oop_handle_reg, c_rarg1);
2232 
2233     // Get address of the box
2234 
2235     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2236 
2237     // Load the oop from the handle
2238     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2239 
2240     if (LockingMode == LM_MONITOR) {
2241       __ jmp(slow_path_lock);
2242     } else if (LockingMode == LM_LEGACY) {
2243       // Load immediate 1 into swap_reg %rax
2244       __ movl(swap_reg, 1);
2245 
2246       // Load (object->mark() | 1) into swap_reg %rax
2247       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2248 
2249       // Save (object->mark() | 1) into BasicLock's displaced header
2250       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2251 
2252       // src -> dest iff dest == rax else rax <- dest
2253       __ lock();
2254       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2255       __ jcc(Assembler::equal, count_mon);
2256 
2257       // Hmm should this move to the slow path code area???
2258 
2259       // Test if the oopMark is an obvious stack pointer, i.e.,
2260       //  1) (mark & 3) == 0, and
      //  2) rsp <= mark < rsp + os::pagesize()
2262       // These 3 tests can be done by evaluating the following
2263       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2264       // assuming both stack pointer and pagesize have their
2265       // least significant 2 bits clear.
2266       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
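      // e.g. with a 4K page the mask (3 - 4096) is 0x...f003, so the AND is zero
      // exactly when mark - rsp is in [0, 4096) and the low two bits of mark are clear.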
2267 
2268       __ subptr(swap_reg, rsp);
2269       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2270 
2271       // Save the test result, for recursive case, the result is zero
2272       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2273       __ jcc(Assembler::notEqual, slow_path_lock);
2274     } else {
2275       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2276       __ lightweight_lock(lock_reg, obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2277     }
2278     __ bind(count_mon);
2279     __ inc_held_monitor_count();
2280 
2281     // Slow path will re-enter here
2282     __ bind(lock_done);
2283   }
2284 
2285   // Finally just about ready to make the JNI call
2286 
2287   // get JNIEnv* which is first argument to native
2288   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2289 
2290   // Now set thread in native
2291   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2292 
2293   __ call(RuntimeAddress(native_func));
2294 
2295   // Verify or restore cpu control state after JNI call
2296   __ restore_cpu_control_state_after_jni(rscratch1);
2297 
2298   // Unpack native results.
2299   switch (ret_type) {
2300   case T_BOOLEAN: __ c2bool(rax);            break;
2301   case T_CHAR   : __ movzwl(rax, rax);      break;
2302   case T_BYTE   : __ sign_extend_byte (rax); break;
2303   case T_SHORT  : __ sign_extend_short(rax); break;
2304   case T_INT    : /* nothing to do */        break;
2305   case T_DOUBLE :
2306   case T_FLOAT  :
2307     // Result is in xmm0 we'll save as needed
2308     break;
2309   case T_ARRAY:                 // Really a handle
2310   case T_OBJECT:                // Really a handle
2311       break; // can't de-handlize until after safepoint check
2312   case T_VOID: break;
2313   case T_LONG: break;
2314   default       : ShouldNotReachHere();
2315   }
2316 
2317   Label after_transition;
2318 
2319   // Switch thread to "native transition" state before reading the synchronization state.
2320   // This additional state is necessary because reading and testing the synchronization
2321   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2322   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2323   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2324   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
2326   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2327 
2328   // Force this write out before the read below
2329   if (!UseSystemMemoryBarrier) {
2330     __ membar(Assembler::Membar_mask_bits(
2331               Assembler::LoadLoad | Assembler::LoadStore |
2332               Assembler::StoreLoad | Assembler::StoreStore));
2333   }
2334 
2335   // check for safepoint operation in progress and/or pending suspend requests
2336   {
2337     Label Continue;
2338     Label slow_path;
2339 
2340     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2341 
2342     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2343     __ jcc(Assembler::equal, Continue);
2344     __ bind(slow_path);
2345 
2346     // Don't use call_VM as it will see a possible pending exception and forward it
2347     // and never return here preventing us from clearing _last_native_pc down below.
    // We also can't use call_VM_leaf, as it will check whether rsi & rdi are
    // preserved and correspond to the bcp/locals pointers. So we do the runtime call
    // by hand.
2351     //
2352     __ vzeroupper();
2353     save_native_result(masm, ret_type, stack_slots);
2354     __ mov(c_rarg0, r15_thread);
2355     __ mov(r12, rsp); // remember sp
2356     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2357     __ andptr(rsp, -16); // align stack as required by ABI
2358     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2359     __ mov(rsp, r12); // restore sp
2360     __ reinit_heapbase();
2361     // Restore any method result value
2362     restore_native_result(masm, ret_type, stack_slots);
2363     __ bind(Continue);
2364   }
2365 
2366   // change thread state
2367   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2368   __ bind(after_transition);
2369 
2370   Label reguard;
2371   Label reguard_done;
2372   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2373   __ jcc(Assembler::equal, reguard);
2374   __ bind(reguard_done);
2375 
2376   // native result if any is live
2377 
2378   // Unlock
2379   Label slow_path_unlock;
2380   Label unlock_done;
2381   if (method->is_synchronized()) {
2382 
2383     Label fast_done;
2384 
2385     // Get locked oop from the handle we passed to jni
2386     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2387 
2388     if (LockingMode == LM_LEGACY) {
2389       Label not_recur;
2390       // Simple recursive lock?
2391       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2392       __ jcc(Assembler::notEqual, not_recur);
2393       __ dec_held_monitor_count();
2394       __ jmpb(fast_done);
2395       __ bind(not_recur);
2396     }
2397 
2398     // Must save rax if it is live now because cmpxchg must use it
2399     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2400       save_native_result(masm, ret_type, stack_slots);
2401     }
2402 
2403     if (LockingMode == LM_MONITOR) {
2404       __ jmp(slow_path_unlock);
2405     } else if (LockingMode == LM_LEGACY) {
2406       // get address of the stack lock
2407       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2408       //  get old displaced header
2409       __ movptr(old_hdr, Address(rax, 0));
2410 
2411       // Atomic swap old header if oop still contains the stack lock
2412       __ lock();
2413       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2414       __ jcc(Assembler::notEqual, slow_path_unlock);
2415       __ dec_held_monitor_count();
2416     } else {
2417       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2418       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2419       __ dec_held_monitor_count();
2420     }
2421 
2422     // slow path re-enters here
2423     __ bind(unlock_done);
2424     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2425       restore_native_result(masm, ret_type, stack_slots);
2426     }
2427 
2428     __ bind(fast_done);
2429   }
2430   if (DTraceMethodProbes) {
2431     save_native_result(masm, ret_type, stack_slots);
2432     __ mov_metadata(c_rarg1, method());
2433     __ call_VM_leaf(
2434          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2435          r15_thread, c_rarg1);
2436     restore_native_result(masm, ret_type, stack_slots);
2437   }
2438 
2439   __ reset_last_Java_frame(false);
2440 
2441   // Unbox oop result, e.g. JNIHandles::resolve value.
2442   if (is_reference_type(ret_type)) {
2443     __ resolve_jobject(rax /* value */,
2444                        r15_thread /* thread */,
2445                        rcx /* tmp */);
2446   }
2447 
2448   if (CheckJNICalls) {
2449     // clear_pending_jni_exception_check
2450     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2451   }
2452 
2453   // reset handle block
2454   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2455   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2456 
2457   // pop our frame
2458 
2459   __ leave();
2460 
2461   // Any exception pending?
2462   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2463   __ jcc(Assembler::notEqual, exception_pending);
2464 
2465   // Return
2466 
2467   __ ret(0);
2468 
2469   // Unexpected paths are out of line and go here
2470 
2471   // forward the exception
2472   __ bind(exception_pending);
2473 
2474   // and forward the exception
2475   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2476 
2477   // Slow path locking & unlocking
2478   if (method->is_synchronized()) {
2479 
2480     // BEGIN Slow path lock
2481     __ bind(slow_path_lock);
2482 
2483     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2484     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2485 
2486     // protect the args we've loaded
2487     save_args(masm, total_c_args, c_arg, out_regs);
2488 
2489     __ mov(c_rarg0, obj_reg);
2490     __ mov(c_rarg1, lock_reg);
2491     __ mov(c_rarg2, r15_thread);
2492 
2493     // Not a leaf but we have last_Java_frame setup as we want
2494     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2495     restore_args(masm, total_c_args, c_arg, out_regs);
2496 
2497 #ifdef ASSERT
2498     { Label L;
2499     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2500     __ jcc(Assembler::equal, L);
2501     __ stop("no pending exception allowed on exit from monitorenter");
2502     __ bind(L);
2503     }
2504 #endif
2505     __ jmp(lock_done);
2506 
2507     // END Slow path lock
2508 
2509     // BEGIN Slow path unlock
2510     __ bind(slow_path_unlock);
2511 
2512     // If we haven't already saved the native result we must save it now as xmm registers
2513     // are still exposed.
2514     __ vzeroupper();
2515     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2516       save_native_result(masm, ret_type, stack_slots);
2517     }
2518 
2519     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2520 
2521     __ mov(c_rarg0, obj_reg);
2522     __ mov(c_rarg2, r15_thread);
2523     __ mov(r12, rsp); // remember sp
2524     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2525     __ andptr(rsp, -16); // align stack as required by ABI
2526 
2527     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2528     // NOTE that obj_reg == rbx currently
2529     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2530     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2531 
2532     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2533     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2534     __ mov(rsp, r12); // restore sp
2535     __ reinit_heapbase();
2536 #ifdef ASSERT
2537     {
2538       Label L;
2539       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2540       __ jcc(Assembler::equal, L);
2541       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2542       __ bind(L);
2543     }
2544 #endif /* ASSERT */
2545 
2546     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2547 
2548     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2549       restore_native_result(masm, ret_type, stack_slots);
2550     }
2551     __ jmp(unlock_done);
2552 
2553     // END Slow path unlock
2554 
2555   } // synchronized
2556 
2557   // SLOW PATH Reguard the stack if needed
2558 
2559   __ bind(reguard);
2560   __ vzeroupper();
2561   save_native_result(masm, ret_type, stack_slots);
2562   __ mov(r12, rsp); // remember sp
2563   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2564   __ andptr(rsp, -16); // align stack as required by ABI
2565   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2566   __ mov(rsp, r12); // restore sp
2567   __ reinit_heapbase();
2568   restore_native_result(masm, ret_type, stack_slots);
2569   // and continue
2570   __ jmp(reguard_done);
2571 
2572 
2573 
2574   __ flush();
2575 
2576   nmethod *nm = nmethod::new_native_nmethod(method,
2577                                             compile_id,
2578                                             masm->code(),
2579                                             vep_offset,
2580                                             frame_complete,
2581                                             stack_slots / VMRegImpl::slots_per_word,
2582                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2583                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2584                                             oop_maps);
2585 
2586   return nm;
2587 }
2588 
// This function returns the adjustment (in number of words) to the size of a c2i adapter
// activation, for use during deoptimization.
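// For example, a callee with 2 parameters and 5 locals needs
// (5 - 2) * Interpreter::stackElementWords additional words.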
2591 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2592   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2593 }
2594 
2595 
2596 uint SharedRuntime::out_preserve_stack_slots() {
2597   return 0;
2598 }
2599 
2600 
2601 // Number of stack slots between incoming argument block and the start of
2602 // a new frame.  The PROLOG must add this many slots to the stack.  The
2603 // EPILOG must remove this many slots.  amd64 needs two slots for
2604 // return address.
2605 uint SharedRuntime::in_preserve_stack_slots() {
2606   return 4 + 2 * VerifyStackAtCalls;
2607 }
2608 
2609 //------------------------------generate_deopt_blob----------------------------
2610 void SharedRuntime::generate_deopt_blob() {
2611   // Allocate space for the code
2612   ResourceMark rm;
2613   // Setup code generation tools
2614   int pad = 0;
2615   if (UseAVX > 2) {
2616     pad += 1024;
2617   }
2618   if (UseAPX) {
2619     pad += 1024;
2620   }
2621 #if INCLUDE_JVMCI
2622   if (EnableJVMCI) {
2623     pad += 512; // Increase the buffer size when compiling for JVMCI
2624   }
2625 #endif
2626   const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id);
2627   CodeBuffer buffer(name, 2560+pad, 1024);
2628   MacroAssembler* masm = new MacroAssembler(&buffer);
2629   int frame_size_in_words;
2630   OopMap* map = nullptr;
2631   OopMapSet *oop_maps = new OopMapSet();
2632 
2633   // -------------
2634   // This code enters when returning to a de-optimized nmethod.  A return
2635   // address has been pushed on the stack, and return values are in
2636   // registers.
  // If we are doing a normal deopt then we were called from the patched
  // nmethod, at the point where we returned into it. So the return
  // address on the stack is off by NativeCall::instruction_size.
  // We will adjust the value so it looks like we have the original return
  // address on the stack (as when we eagerly deoptimized).
2642   // In the case of an exception pending when deoptimizing, we enter
2643   // with a return address on the stack that points after the call we patched
2644   // into the exception handler. We have the following register state from,
2645   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2646   //    rax: exception oop
2647   //    rbx: exception handler
2648   //    rdx: throwing pc
2649   // So in this case we simply jam rdx into the useless return address and
2650   // the stack looks just like we want.
2651   //
2652   // At this point we need to de-opt.  We save the argument return
2653   // registers.  We call the first C routine, fetch_unroll_info().  This
2654   // routine captures the return values and returns a structure which
2655   // describes the current frame size and the sizes of all replacement frames.
2656   // The current frame is compiled code and may contain many inlined
2657   // functions, each with their own JVM state.  We pop the current frame, then
2658   // push all the new frames.  Then we call the C routine unpack_frames() to
2659   // populate these frames.  Finally unpack_frames() returns us the new target
2660   // address.  Notice that callee-save registers are BLOWN here; they have
2661   // already been captured in the vframeArray at the time the return PC was
2662   // patched.
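       //
       // In outline, the code below does roughly the following (a sketch for
       // orientation only, not additional generated code):
       //
       //   save_live_registers()                        // capture return values and oops
       //   info = fetch_unroll_info(thread, exec_mode)  // sizes/pcs of the replacement frames
       //   restore_result_registers()
       //   pop the deoptimized frame
       //   for each replacement frame: push pc, push rbp, allocate the skeletal frame
       //   unpack_frames(thread, exec_mode)             // fill in the skeletal frames
       //   ret                                          // into the topmost new frame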
2663   address start = __ pc();
2664   Label cont;
2665 
2666   // Prolog for the non-exception case
2667 
2668   // Save everything in sight.
2669   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2670 
2671   // Normal deoptimization.  Save exec mode for unpack_frames.
2672   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2673   __ jmp(cont);
2674 
2675   int reexecute_offset = __ pc() - start;
2676 #if INCLUDE_JVMCI && !defined(COMPILER1)
2677   if (UseJVMCICompiler) {
2678     // JVMCI does not use this kind of deoptimization
2679     __ should_not_reach_here();
2680   }
2681 #endif
2682 
2683   // Reexecute case
2684   // The return address is the pc that describes what bci to re-execute at.
2685 
2686   // No need to update map as each call to save_live_registers will produce identical oopmap
2687   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2688 
2689   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2690   __ jmp(cont);
2691 
2692 #if INCLUDE_JVMCI
2693   Label after_fetch_unroll_info_call;
2694   int implicit_exception_uncommon_trap_offset = 0;
2695   int uncommon_trap_offset = 0;
2696 
2697   if (EnableJVMCI) {
2698     implicit_exception_uncommon_trap_offset = __ pc() - start;
2699 
2700     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2701     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2702 
2703     uncommon_trap_offset = __ pc() - start;
2704 
2705     // Save everything in sight.
2706     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2707     // fetch_unroll_info needs to call last_java_frame()
2708     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2709 
2710     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2711     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2712 
2713     __ movl(r14, Deoptimization::Unpack_reexecute);
2714     __ mov(c_rarg0, r15_thread);
2715     __ movl(c_rarg2, r14); // exec mode
2716     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2717     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2718 
2719     __ reset_last_Java_frame(false);
2720 
2721     __ jmp(after_fetch_unroll_info_call);
2722   } // EnableJVMCI
2723 #endif // INCLUDE_JVMCI
2724 
2725   int exception_offset = __ pc() - start;
2726 
2727   // Prolog for exception case
2728 
2729   // all registers are dead at this entry point, except for rax and
2730   // rdx, which contain the exception oop and exception pc
2731   // respectively.  Set them in TLS and fall thru to the
2732   // unpack_with_exception_in_tls entry point.
2733 
2734   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2735   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2736 
2737   int exception_in_tls_offset = __ pc() - start;
2738 
2739   // new implementation because exception oop is now passed in JavaThread
2740 
2741   // Prolog for exception case
2742   // All registers must be preserved because they might be used by LinearScan
2743   // Exception oop and throwing PC are passed in JavaThread
2744   // tos: stack at point of call to method that threw the exception (i.e. only
2745   // args are on the stack, no return address)
2746 
2747   // make room on stack for the return address
2748   // It will be patched later with the throwing pc. The correct value is not
2749   // available now because loading it from memory would destroy registers.
2750   __ push(0);
2751 
2752   // Save everything in sight.
2753   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2754 
2755   // Now it is safe to overwrite any register
2756 
2757   // Deopt during an exception.  Save exec mode for unpack_frames.
2758   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2759 
2760   // load throwing pc from JavaThread and patch it as the return address
2761   // of the current frame. Then clear the field in JavaThread
2762 
2763   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2764   __ movptr(Address(rbp, wordSize), rdx);
2765   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2766 
2767 #ifdef ASSERT
2768   // verify that there is really an exception oop in JavaThread
2769   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2770   __ verify_oop(rax);
2771 
2772   // verify that there is no pending exception
2773   Label no_pending_exception;
2774   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2775   __ testptr(rax, rax);
2776   __ jcc(Assembler::zero, no_pending_exception);
2777   __ stop("must not have pending exception here");
2778   __ bind(no_pending_exception);
2779 #endif
2780 
2781   __ bind(cont);
2782 
2783   // Call C code.  Need thread and this frame, but NOT official VM entry
2784   // crud.  We cannot block on this call, no GC can happen.
2785   //
2786   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2787 
2788   // fetch_unroll_info needs to call last_java_frame().
2789 
2790   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2791 #ifdef ASSERT
2792   { Label L;
2793     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2794     __ jcc(Assembler::equal, L);
2795     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2796     __ bind(L);
2797   }
2798 #endif // ASSERT
2799   __ mov(c_rarg0, r15_thread);
2800   __ movl(c_rarg1, r14); // exec_mode
2801   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2802 
2803   // Need to have an oopmap that tells fetch_unroll_info where to
2804   // find any register it might need.
2805   oop_maps->add_gc_map(__ pc() - start, map);
2806 
2807   __ reset_last_Java_frame(false);
2808 
2809 #if INCLUDE_JVMCI
2810   if (EnableJVMCI) {
2811     __ bind(after_fetch_unroll_info_call);
2812   }
2813 #endif
2814 
2815   // Load UnrollBlock* into rdi
2816   __ mov(rdi, rax);
2817 
2818   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2819   Label noException;
2820   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2821   __ jcc(Assembler::notEqual, noException);
2822   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2823   // QQQ this is useless; exception_pc was cleared to null above
2824   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2825   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2826   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2827 
2828   __ verify_oop(rax);
2829 
2830   // Overwrite the result registers with the exception results.
2831   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2832   // I think this is useless
2833   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2834 
2835   __ bind(noException);
2836 
2837   // Only register save data is on the stack.
2838   // Now restore the result registers.  Everything else is either dead
2839   // or captured in the vframeArray.
2840   RegisterSaver::restore_result_registers(masm);
2841 
2842   // All of the register save area has been popped off the stack. Only the
2843   // return address remains.
2844 
2845   // Pop all the frames we must move/replace.
2846   //
2847   // Frame picture (youngest to oldest)
2848   // 1: self-frame (no frame link)
2849   // 2: deopting frame  (no frame link)
2850   // 3: caller of deopting frame (could be compiled/interpreted).
2851   //
2852   // Note: by leaving the return address of self-frame on the stack
2853   // and using the size of frame 2 to adjust the stack
2854   // when we are done the return to frame 3 will still be on the stack.
2855 
2856   // Pop deoptimized frame
2857   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2858   __ addptr(rsp, rcx);
2859 
2860   // rsp should be pointing at the return address to the caller (3)
2861 
2862   // Pick up the initial fp we should save
2863   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2864   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2865 
2866 #ifdef ASSERT
2867   // Compilers generate code that bangs the stack by as much as the
2868   // interpreter would need. So this stack banging should never
2869   // trigger a fault. Verify that it does not on non-product builds.
2870   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2871   __ bang_stack_size(rbx, rcx);
2872 #endif
2873 
2874   // Load address of array of frame pcs into rcx
2875   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2876 
2877   // Trash the old pc
2878   __ addptr(rsp, wordSize);
2879 
2880   // Load address of array of frame sizes into rsi
2881   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2882 
2883   // Load counter into rdx
2884   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2885 
2886   // Now adjust the caller's stack to make up for the extra locals, but
2887   // record the original sp first so that we can save it in the skeletal
2888   // interpreter frame; then the stack walking of interpreter_sender will
2889   // get the unextended sp value and not the "real" sp value.
2890 
2891   const Register sender_sp = r8;
2892 
2893   __ mov(sender_sp, rsp);
2894   __ movl(rbx, Address(rdi,
2895                        Deoptimization::UnrollBlock::
2896                        caller_adjustment_offset()));
2897   __ subptr(rsp, rbx);
2898 
2899   // Push interpreter frames in a loop
2900   Label loop;
2901   __ bind(loop);
2902   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2903   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2904   __ pushptr(Address(rcx, 0));          // Save return address
2905   __ enter();                           // Save old & set new ebp
2906   __ subptr(rsp, rbx);                  // Prolog
2907   // This value is corrected by layout_activation_impl
2908   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2909   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2910   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2911   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2912   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2913   __ decrementl(rdx);                   // Decrement counter
2914   __ jcc(Assembler::notZero, loop);
2915   __ pushptr(Address(rcx, 0));          // Save final return address
2916 
2917   // Re-push self-frame
2918   __ enter();                           // Save old & set new ebp
2919 
2920   // Allocate a full sized register save area.
2921   // Return address and rbp are in place, so we allocate two fewer words.
2922   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2923 
2924   // Restore frame locals after moving the frame
2925   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2926   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2927 
2928   // Call C code.  Need thread but NOT official VM entry
2929   // crud.  We cannot block on this call, no GC can happen.  Call should
2930   // restore return values to their stack-slots with the new SP.
2931   //
2932   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2933 
2934   // Use rbp because the frames look interpreted now
2935   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2936   // Don't need the precise return PC here, just precise enough to point into this code blob.
2937   address the_pc = __ pc();
2938   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2939 
2940   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2941   __ mov(c_rarg0, r15_thread);
2942   __ movl(c_rarg1, r14); // second arg: exec_mode
2943   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2944   // Revert SP alignment after call since we're going to do some SP relative addressing below
2945   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2946 
2947   // Set an oopmap for the call site
2948   // Use the same PC we used for the last java frame
2949   oop_maps->add_gc_map(the_pc - start,
2950                        new OopMap( frame_size_in_words, 0 ));
2951 
2952   // Clear fp AND pc
2953   __ reset_last_Java_frame(true);
2954 
2955   // Collect return values
2956   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2957   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2958   // I think this is useless (throwing pc?)
2959   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2960 
2961   // Pop self-frame.
2962   __ leave();                           // Epilog
2963 
2964   // Jump to interpreter
2965   __ ret(0);
2966 
2967   // Make sure all code is generated
2968   masm->flush();
2969 
2970   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2971   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2972 #if INCLUDE_JVMCI
2973   if (EnableJVMCI) {
2974     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2975     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2976   }
2977 #endif
2978 }
2979 
2980 //------------------------------generate_handler_blob------
2981 //
2982 // Generate a special Compile2Runtime blob that saves all registers,
2983 // and sets up the oopmap.
2984 //
2985 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) {
2986   assert(StubRoutines::forward_exception_entry() != nullptr,
2987          "must be generated before");
2988   assert(is_polling_page_id(id), "expected a polling page stub id");
2989 
2990   ResourceMark rm;
2991   OopMapSet *oop_maps = new OopMapSet();
2992   OopMap* map;
2993 
2994   // Allocate space for the code.  Setup code generation tools.
2995   const char* name = SharedRuntime::stub_name(id);
2996   CodeBuffer buffer(name, 2348, 1024);
2997   MacroAssembler* masm = new MacroAssembler(&buffer);
2998 
2999   address start   = __ pc();
3000   address call_pc = nullptr;
3001   int frame_size_in_words;
3002   bool cause_return = (id == SharedStubId::polling_page_return_handler_id);
3003   bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id);
3004 
3005   // Make room for return address (or push it again)
3006   if (!cause_return) {
3007     __ push(rbx);
3008   }
3009 
3010   // Save registers, fpu state, and flags
3011   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3012 
3013   // The following is basically a call_VM.  However, we need the precise
3014   // address of the call in order to generate an oopmap. Hence, we do all the
3015   // work ourselves.
3016 
3017   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3018 
3019   // The return address must always be correct so that the frame constructor
3020   // never sees an invalid pc.
3021 
3022   if (!cause_return) {
3023     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3024     // Additionally, rbx is a callee saved register and we can look at it later to determine
3025     // if someone changed the return address for us!
3026     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3027     __ movptr(Address(rbp, wordSize), rbx);
3028   }
3029 
3030   // Do the call
3031   __ mov(c_rarg0, r15_thread);
3032   __ call(RuntimeAddress(call_ptr));
3033 
3034   // Set an oopmap for the call site.  This oopmap will map all
3035   // oop-registers and debug-info registers as callee-saved.  This
3036   // will allow deoptimization at this safepoint to find all possible
3037   // debug-info recordings, as well as let GC find all oops.
3038 
3039   oop_maps->add_gc_map( __ pc() - start, map);
3040 
3041   Label noException;
3042 
3043   __ reset_last_Java_frame(false);
3044 
3045   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3046   __ jcc(Assembler::equal, noException);
3047 
3048   // Exception pending
3049 
3050   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3051 
3052   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3053 
3054   // No exception case
3055   __ bind(noException);
3056 
3057   Label no_adjust;
3058 #ifdef ASSERT
3059   Label bail;
3060 #endif
3061   if (!cause_return) {
3062     Label no_prefix, not_special;
3063 
3064     // If our stashed return pc was modified by the runtime we avoid touching it
3065     __ cmpptr(rbx, Address(rbp, wordSize));
3066     __ jccb(Assembler::notEqual, no_adjust);
3067 
3068     // Skip over the poll instruction.
3069     // See NativeInstruction::is_safepoint_poll()
3070     // Possible encodings:
3071     //      85 00       test   %eax,(%rax)
3072     //      85 01       test   %eax,(%rcx)
3073     //      85 02       test   %eax,(%rdx)
3074     //      85 03       test   %eax,(%rbx)
3075     //      85 06       test   %eax,(%rsi)
3076     //      85 07       test   %eax,(%rdi)
3077     //
3078     //   41 85 00       test   %eax,(%r8)
3079     //   41 85 01       test   %eax,(%r9)
3080     //   41 85 02       test   %eax,(%r10)
3081     //   41 85 03       test   %eax,(%r11)
3082     //   41 85 06       test   %eax,(%r14)
3083     //   41 85 07       test   %eax,(%r15)
3084     //
3085     //      85 04 24    test   %eax,(%rsp)
3086     //   41 85 04 24    test   %eax,(%r12)
3087     //      85 45 00    test   %eax,0x0(%rbp)
3088     //   41 85 45 00    test   %eax,0x0(%r13)
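         //
         // Worked example (illustration only): for the 4-byte encoding
         //   41 85 45 00    test   %eax,0x0(%r13)
         // the code below adds 1 for the REX.B prefix, 1 more because the modrm
         // rm field is 0x05 (rbp/r13 carries a disp8 byte), and finally 2 for the
         // opcode and modrm bytes, advancing the return pc by 4 -- exactly past
         // the poll.  For the plain 2-byte encoding 85 00 only the final "+2"
         // applies.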
3089 
3090     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3091     __ jcc(Assembler::notEqual, no_prefix);
3092     __ addptr(rbx, 1);
3093     __ bind(no_prefix);
3094 #ifdef ASSERT
3095     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3096 #endif
3097     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3098     // r12/rsp 0x04
3099     // r13/rbp 0x05
3100     __ movzbq(rcx, Address(rbx, 1));
3101     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3102     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3103     __ cmpptr(rcx, 1);
3104     __ jcc(Assembler::above, not_special);
3105     __ addptr(rbx, 1);
3106     __ bind(not_special);
3107 #ifdef ASSERT
3108     // Verify the correct encoding of the poll we're about to skip.
3109     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3110     __ jcc(Assembler::notEqual, bail);
3111     // Mask out the modrm bits
3112     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3113     // rax encodes to 0, so if the bits are nonzero it's incorrect
3114     __ jcc(Assembler::notZero, bail);
3115 #endif
3116     // Adjust return pc forward to step over the safepoint poll instruction
3117     __ addptr(rbx, 2);
3118     __ movptr(Address(rbp, wordSize), rbx);
3119   }
3120 
3121   __ bind(no_adjust);
3122   // Normal exit, restore registers and exit.
3123   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3124   __ ret(0);
3125 
3126 #ifdef ASSERT
3127   __ bind(bail);
3128   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3129 #endif
3130 
3131   // Make sure all code is generated
3132   masm->flush();
3133 
3134   // Fill-out other meta info
3135   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3136 }
3137 
3138 //
3139 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3140 //
3141 // Generate a stub that calls into the VM to find out the proper destination
3142 // of a java call. All the argument registers are live at this point,
3143 // but since this is generic code we don't know what they are and the
3144 // caller must do any gc of the args.
3145 //
3146 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3147   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3148   assert(is_resolve_id(id), "expected a resolve stub id");
3149 
3150   // allocate space for the code
3151   ResourceMark rm;
3152 
3153   const char* name = SharedRuntime::stub_name(id);
3154   CodeBuffer buffer(name, 1552, 512);
3155   MacroAssembler* masm = new MacroAssembler(&buffer);
3156 
3157   int frame_size_in_words;
3158 
3159   OopMapSet *oop_maps = new OopMapSet();
3160   OopMap* map = nullptr;
3161 
3162   int start = __ offset();
3163 
3164   // No need to save vector registers since they are caller-saved anyway.
3165   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3166 
3167   int frame_complete = __ offset();
3168 
3169   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3170 
3171   __ mov(c_rarg0, r15_thread);
3172 
3173   __ call(RuntimeAddress(destination));
3174 
3175 
3176   // Set an oopmap for the call site.
3177   // We need this not only for callee-saved registers, but also for volatile
3178   // registers that the compiler might be keeping live across a safepoint.
3179 
3180   oop_maps->add_gc_map( __ offset() - start, map);
3181 
3182   // rax contains the address we are going to jump to, assuming no exception was installed
3183 
3184   // clear last_Java_sp
3185   __ reset_last_Java_frame(false);
3186   // check for pending exceptions
3187   Label pending;
3188   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3189   __ jcc(Assembler::notEqual, pending);
3190 
3191   // get the returned Method*
3192   __ get_vm_result_2(rbx, r15_thread);
3193   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3194 
3195   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3196 
3197   RegisterSaver::restore_live_registers(masm);
3198 
3199   // We are back to the original state on entry and ready to go.
3200 
3201   __ jmp(rax);
3202 
3203   // Pending exception after the safepoint
3204 
3205   __ bind(pending);
3206 
3207   RegisterSaver::restore_live_registers(masm);
3208 
3209   // exception pending => remove activation and forward to exception handler
3210 
3211   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3212 
3213   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3214   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3215 
3216   // -------------
3217   // make sure all code is generated
3218   masm->flush();
3219 
3220   // return the blob
3221   // (the codeBlob frame size passed here is in words)
3222   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3223 }
3224 
3225 // Continuation point for throwing of implicit exceptions that are
3226 // not handled in the current activation. Fabricates an exception
3227 // oop and initiates normal exception dispatching in this
3228 // frame. Since we need to preserve callee-saved values (currently
3229 // only for C2, but done for C1 as well) we need a callee-saved oop
3230 // map and therefore have to make these stubs into RuntimeStubs
3231 // rather than BufferBlobs.  If the compiler needs all registers to
3232 // be preserved between the fault point and the exception handler
3233 // then it must assume responsibility for that in
3234 // AbstractCompiler::continuation_for_implicit_null_exception or
3235 // continuation_for_implicit_division_by_zero_exception. All other
3236 // implicit exceptions (e.g., NullPointerException or
3237 // AbstractMethodError on entry) are either at call sites or
3238 // otherwise assume that stack unwinding will be initiated, so
3239 // caller saved registers were assumed volatile in the compiler.
3240 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
3241   assert(is_throw_id(id), "expected a throw stub id");
3242 
3243   const char* name = SharedRuntime::stub_name(id);
3244 
3245   // Information about frame layout at time of blocking runtime call.
3246   // Note that we only have to preserve callee-saved registers since
3247   // the compilers are responsible for supplying a continuation point
3248   // if they expect all registers to be preserved.
3249   enum layout {
3250     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3251     rbp_off2,
3252     return_off,
3253     return_off2,
3254     framesize // inclusive of return address
3255   };
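       // For illustration (assuming the usual frame::arg_reg_save_area_bytes of
       // 0 on System V and 32 on Windows, which is not asserted here): each enum
       // entry is one 32-bit VMReg slot, so on System V framesize == 4 slots ==
       // 2 words, i.e. just the saved rbp and the return address, while Windows
       // adds 8 slots for the argument register shadow area.  The prolog below
       // therefore subtracts (framesize - 4) << LogBytesPerInt bytes, since rbp
       // and the return address are already on the stack, and the blob's frame
       // size is reported in words via framesize >> (LogBytesPerWord - LogBytesPerInt).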
3256 
3257   int insts_size = 512;
3258   int locs_size  = 64;
3259 
3260   ResourceMark rm;
3261   const char* timer_msg = "SharedRuntime generate_throw_exception";
3262   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3263 
3264   CodeBuffer code(name, insts_size, locs_size);
3265   OopMapSet* oop_maps  = new OopMapSet();
3266   MacroAssembler* masm = new MacroAssembler(&code);
3267 
3268   address start = __ pc();
3269 
3270   // This is an inlined and slightly modified version of call_VM
3271   // which has the ability to fetch the return PC out of
3272   // thread-local storage and also sets up last_Java_sp slightly
3273   // differently than the real call_VM
3274 
3275   __ enter(); // required for proper stackwalking of RuntimeStub frame
3276 
3277   assert(is_even(framesize/2), "sp not 16-byte aligned");
3278 
3279   // return address and rbp are already in place
3280   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3281 
3282   int frame_complete = __ pc() - start;
3283 
3284   // Set up last_Java_sp and last_Java_fp
3285   address the_pc = __ pc();
3286   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3287   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3288 
3289   // Call runtime
3290   __ movptr(c_rarg0, r15_thread);
3291   BLOCK_COMMENT("call runtime_entry");
3292   __ call(RuntimeAddress(runtime_entry));
3293 
3294   // Generate oop map
3295   OopMap* map = new OopMap(framesize, 0);
3296 
3297   oop_maps->add_gc_map(the_pc - start, map);
3298 
3299   __ reset_last_Java_frame(true);
3300 
3301   __ leave(); // required for proper stackwalking of RuntimeStub frame
3302 
3303   // check for pending exceptions
3304 #ifdef ASSERT
3305   Label L;
3306   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3307   __ jcc(Assembler::notEqual, L);
3308   __ should_not_reach_here();
3309   __ bind(L);
3310 #endif // ASSERT
3311   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3312 
3313 
3314   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3315   RuntimeStub* stub =
3316     RuntimeStub::new_runtime_stub(name,
3317                                   &code,
3318                                   frame_complete,
3319                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3320                                   oop_maps, false);
3321   return stub;
3322 }
3323 
3324 //------------------------------Montgomery multiplication------------------------
3325 //
3326 
3327 #ifndef _WINDOWS
3328 
3329 // Subtract 0:b from carry:a.  Return carry.
3330 static julong
3331 sub(julong a[], julong b[], julong carry, long len) {
3332   long long i = 0, cnt = len;
3333   julong tmp;
3334   asm volatile("clc; "
3335                "0: ; "
3336                "mov (%[b], %[i], 8), %[tmp]; "
3337                "sbb %[tmp], (%[a], %[i], 8); "
3338                "inc %[i]; dec %[cnt]; "
3339                "jne 0b; "
3340                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3341                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3342                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3343                : "memory");
3344   return tmp;
3345 }
3346 
3347 // Multiply (unsigned) Long A by Long B, accumulating the double-
3348 // length result into the accumulator formed of T0, T1, and T2.
3349 #define MACC(A, B, T0, T1, T2)                                  \
3350 do {                                                            \
3351   unsigned long hi, lo;                                         \
3352   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3353            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3354            : "r"(A), "a"(B) : "cc");                            \
3355  } while(0)
3356 
3357 // As above, but add twice the double-length result into the
3358 // accumulator.
3359 #define MACC2(A, B, T0, T1, T2)                                 \
3360 do {                                                            \
3361   unsigned long hi, lo;                                         \
3362   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3363            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3364            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3365            : "r"(A), "a"(B) : "cc");                            \
3366  } while(0)
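     // In effect (a sketch for exposition only, assuming a 128-bit intermediate
     // type; the real macros above and the _WINDOWS intrinsic versions below do
     // the same thing without one):
     //
     //   p = (unsigned __int128)A * B;
     //   MACC:  (t2:t1:t0) += p;
     //   MACC2: (t2:t1:t0) += 2 * p;
     //
     // where t0, t1, t2 together form a 192-bit accumulator with t0 as the
     // least significant word.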
3367 
3368 #else //_WINDOWS
3369 
3370 static julong
3371 sub(julong a[], julong b[], julong carry, long len) {
3372   long i;
3373   julong tmp;
3374   unsigned char c = 1;
3375   for (i = 0; i < len; i++) {
3376     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3377     a[i] = tmp;
3378   }
3379   c = _addcarry_u64(c, carry, ~0, &tmp);
3380   return tmp;
3381 }
3382 
3383 // Multiply (unsigned) Long A by Long B, accumulating the double-
3384 // length result into the accumulator formed of T0, T1, and T2.
3385 #define MACC(A, B, T0, T1, T2)                          \
3386 do {                                                    \
3387   julong hi, lo;                            \
3388   lo = _umul128(A, B, &hi);                             \
3389   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3390   c = _addcarry_u64(c, hi, T1, &T1);                    \
3391   _addcarry_u64(c, T2, 0, &T2);                         \
3392  } while(0)
3393 
3394 // As above, but add twice the double-length result into the
3395 // accumulator.
3396 #define MACC2(A, B, T0, T1, T2)                         \
3397 do {                                                    \
3398   julong hi, lo;                            \
3399   lo = _umul128(A, B, &hi);                             \
3400   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3401   c = _addcarry_u64(c, hi, T1, &T1);                    \
3402   _addcarry_u64(c, T2, 0, &T2);                         \
3403   c = _addcarry_u64(0, lo, T0, &T0);                    \
3404   c = _addcarry_u64(c, hi, T1, &T1);                    \
3405   _addcarry_u64(c, T2, 0, &T2);                         \
3406  } while(0)
3407 
3408 #endif //_WINDOWS
3409 
3410 // Fast Montgomery multiplication.  The derivation of the algorithm is
3411 // in  A Cryptographic Library for the Motorola DSP56000,
3412 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
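     //
     // In terms of the interface below (a sketch of the math, not extra checks
     // in the code): with R = 2^(64*len), montgomery_multiply(a, b, n, m, inv, len)
     // computes m such that m is congruent to a * b * R^-1 (mod n), provided
     // inv is -n^-1 (mod 2^64); the assert on inv * n[0] == ULLONG_MAX below
     // checks exactly that for the low word of n.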
3413 
3414 static void NOINLINE
3415 montgomery_multiply(julong a[], julong b[], julong n[],
3416                     julong m[], julong inv, int len) {
3417   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3418   int i;
3419 
3420   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3421 
3422   for (i = 0; i < len; i++) {
3423     int j;
3424     for (j = 0; j < i; j++) {
3425       MACC(a[j], b[i-j], t0, t1, t2);
3426       MACC(m[j], n[i-j], t0, t1, t2);
3427     }
3428     MACC(a[i], b[0], t0, t1, t2);
3429     m[i] = t0 * inv;
3430     MACC(m[i], n[0], t0, t1, t2);
3431 
3432     assert(t0 == 0, "broken Montgomery multiply");
3433 
3434     t0 = t1; t1 = t2; t2 = 0;
3435   }
3436 
3437   for (i = len; i < 2*len; i++) {
3438     int j;
3439     for (j = i-len+1; j < len; j++) {
3440       MACC(a[j], b[i-j], t0, t1, t2);
3441       MACC(m[j], n[i-j], t0, t1, t2);
3442     }
3443     m[i-len] = t0;
3444     t0 = t1; t1 = t2; t2 = 0;
3445   }
3446 
3447   while (t0)
3448     t0 = sub(m, n, t0, len);
3449 }
3450 
3451 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3452 // multiplies so it should be up to 25% faster than Montgomery
3453 // multiplication.  However, its loop control is more complex and it
3454 // may actually run slower on some machines.
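     //
     // Rough count behind the 25% figure: a full Montgomery multiply issues
     // about len^2 word products for a*b plus len^2 for m*n.  When squaring,
     // each cross term a[j]*a[i-j] appears twice, so MACC2 covers a pair with a
     // single multiply and the a*a work drops to about len^2/2, giving roughly
     // 1.5*len^2 instead of 2*len^2 word products overall.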
3455 
3456 static void NOINLINE
3457 montgomery_square(julong a[], julong n[],
3458                   julong m[], julong inv, int len) {
3459   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3460   int i;
3461 
3462   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3463 
3464   for (i = 0; i < len; i++) {
3465     int j;
3466     int end = (i+1)/2;
3467     for (j = 0; j < end; j++) {
3468       MACC2(a[j], a[i-j], t0, t1, t2);
3469       MACC(m[j], n[i-j], t0, t1, t2);
3470     }
3471     if ((i & 1) == 0) {
3472       MACC(a[j], a[j], t0, t1, t2);
3473     }
3474     for (; j < i; j++) {
3475       MACC(m[j], n[i-j], t0, t1, t2);
3476     }
3477     m[i] = t0 * inv;
3478     MACC(m[i], n[0], t0, t1, t2);
3479 
3480     assert(t0 == 0, "broken Montgomery square");
3481 
3482     t0 = t1; t1 = t2; t2 = 0;
3483   }
3484 
3485   for (i = len; i < 2*len; i++) {
3486     int start = i-len+1;
3487     int end = start + (len - start)/2;
3488     int j;
3489     for (j = start; j < end; j++) {
3490       MACC2(a[j], a[i-j], t0, t1, t2);
3491       MACC(m[j], n[i-j], t0, t1, t2);
3492     }
3493     if ((i & 1) == 0) {
3494       MACC(a[j], a[j], t0, t1, t2);
3495     }
3496     for (; j < len; j++) {
3497       MACC(m[j], n[i-j], t0, t1, t2);
3498     }
3499     m[i-len] = t0;
3500     t0 = t1; t1 = t2; t2 = 0;
3501   }
3502 
3503   while (t0)
3504     t0 = sub(m, n, t0, len);
3505 }
3506 
3507 // Swap words in a longword.
3508 static julong swap(julong x) {
3509   return (x << 32) | (x >> 32);
3510 }
3511 
3512 // Copy len longwords from s to d, word-swapping as we go.  The
3513 // destination array is reversed.
3514 static void reverse_words(julong *s, julong *d, int len) {
3515   d += len;
3516   while(len-- > 0) {
3517     d--;
3518     *d = swap(*s);
3519     s++;
3520   }
3521 }
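     // For example (illustration only), with len == 3:
     //   d[0] = swap(s[2]),  d[1] = swap(s[1]),  d[2] = swap(s[0])
     // i.e. the 32-bit halves of each julong are exchanged and the word order
     // of the array is reversed.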
3522 
3523 // The threshold at which squaring is advantageous was determined
3524 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3525 #define MONTGOMERY_SQUARING_THRESHOLD 64
3526 
3527 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3528                                         jint len, jlong inv,
3529                                         jint *m_ints) {
3530   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3531   int longwords = len/2;
3532 
3533   // Make very sure we don't use so much space that the stack might
3534   // overflow.  512 jints corresponds to a 16384-bit integer; with four
3535   // scratch arrays of 256 julongs each, that is 8K bytes of stack space here.
3536   int divisor = sizeof(julong) * 4;
3537   guarantee(longwords <= 8192 / divisor, "must be");
3538   int total_allocation = longwords * sizeof (julong) * 4;
3539   julong *scratch = (julong *)alloca(total_allocation);
3540 
3541   // Local scratch arrays
3542   julong
3543     *a = scratch + 0 * longwords,
3544     *b = scratch + 1 * longwords,
3545     *n = scratch + 2 * longwords,
3546     *m = scratch + 3 * longwords;
3547 
3548   reverse_words((julong *)a_ints, a, longwords);
3549   reverse_words((julong *)b_ints, b, longwords);
3550   reverse_words((julong *)n_ints, n, longwords);
3551 
3552   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3553 
3554   reverse_words(m, (julong *)m_ints, longwords);
3555 }
3556 
3557 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3558                                       jint len, jlong inv,
3559                                       jint *m_ints) {
3560   assert(len % 2 == 0, "array length in montgomery_square must be even");
3561   int longwords = len/2;
3562 
3563   // Make very sure we don't use so much space that the stack might
3564   // overflow.  512 jints corresponds to a 16384-bit integer; with three
3565   // scratch arrays of 256 julongs each, that is 6K bytes of stack space here.
3566   int divisor = sizeof(julong) * 3;
3567   guarantee(longwords <= (8192 / divisor), "must be");
3568   int total_allocation = longwords * sizeof (julong) * 3;
3569   julong *scratch = (julong *)alloca(total_allocation);
3570 
3571   // Local scratch arrays
3572   julong
3573     *a = scratch + 0 * longwords,
3574     *n = scratch + 1 * longwords,
3575     *m = scratch + 2 * longwords;
3576 
3577   reverse_words((julong *)a_ints, a, longwords);
3578   reverse_words((julong *)n_ints, n, longwords);
3579 
3580   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3581     ::montgomery_square(a, n, m, (julong)inv, longwords);
3582   } else {
3583     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3584   }
3585 
3586   reverse_words(m, (julong *)m_ints, longwords);
3587 }
3588 
3589 #if INCLUDE_JFR
3590 
3591 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3592 // It returns a jobject handle to the event writer.
3593 // The handle is dereferenced and the return value is the event writer oop.
3594 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3595   enum layout {
3596     rbp_off,
3597     rbpH_off,
3598     return_off,
3599     return_off2,
3600     framesize // inclusive of return address
3601   };
3602 
3603   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id);
3604   CodeBuffer code(name, 1024, 64);
3605   MacroAssembler* masm = new MacroAssembler(&code);
3606   address start = __ pc();
3607 
3608   __ enter();
3609   address the_pc = __ pc();
3610 
3611   int frame_complete = the_pc - start;
3612 
3613   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3614   __ movptr(c_rarg0, r15_thread);
3615   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3616   __ reset_last_Java_frame(true);
3617 
3618   // rax is jobject handle result, unpack and process it through a barrier.
3619   __ resolve_global_jobject(rax, r15_thread, c_rarg0);
3620 
3621   __ leave();
3622   __ ret(0);
3623 
3624   OopMapSet* oop_maps = new OopMapSet();
3625   OopMap* map = new OopMap(framesize, 1);
3626   oop_maps->add_gc_map(frame_complete, map);
3627 
3628   RuntimeStub* stub =
3629     RuntimeStub::new_runtime_stub(name,
3630                                   &code,
3631                                   frame_complete,
3632                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3633                                   oop_maps,
3634                                   false);
3635   return stub;
3636 }
3637 
3638 // For c2: call to return a leased buffer.
3639 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3640   enum layout {
3641     rbp_off,
3642     rbpH_off,
3643     return_off,
3644     return_off2,
3645     framesize // inclusive of return address
3646   };
3647 
3648   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id);
3649   CodeBuffer code(name, 1024, 64);
3650   MacroAssembler* masm = new MacroAssembler(&code);
3651   address start = __ pc();
3652 
3653   __ enter();
3654   address the_pc = __ pc();
3655 
3656   int frame_complete = the_pc - start;
3657 
3658   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3659   __ movptr(c_rarg0, r15_thread);
3660   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3661   __ reset_last_Java_frame(true);
3662 
3663   __ leave();
3664   __ ret(0);
3665 
3666   OopMapSet* oop_maps = new OopMapSet();
3667   OopMap* map = new OopMap(framesize, 1);
3668   oop_maps->add_gc_map(frame_complete, map);
3669 
3670   RuntimeStub* stub =
3671     RuntimeStub::new_runtime_stub(name,
3672                                   &code,
3673                                   frame_complete,
3674                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3675                                   oop_maps,
3676                                   false);
3677   return stub;
3678 }
3679 
3680 #endif // INCLUDE_JFR
3681