1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "classfile/symbolTable.hpp"
  31 #include "code/aotCodeCache.hpp"
  32 #include "code/compiledIC.hpp"
  33 #include "code/debugInfoRec.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "oops/method.inline.hpp"
  47 #include "prims/methodHandles.hpp"
  48 #include "runtime/continuation.hpp"
  49 #include "runtime/continuationEntry.inline.hpp"
  50 #include "runtime/globals.hpp"
  51 #include "runtime/jniHandles.hpp"
  52 #include "runtime/safepointMechanism.hpp"
  53 #include "runtime/sharedRuntime.hpp"
  54 #include "runtime/signature.hpp"
  55 #include "runtime/stubRoutines.hpp"
  56 #include "runtime/timerTrace.hpp"
  57 #include "runtime/vframeArray.hpp"
  58 #include "runtime/vm_version.hpp"
  59 #include "utilities/align.hpp"
  60 #include "utilities/checkedCast.hpp"
  61 #include "utilities/formatBuffer.hpp"
  62 #include "vmreg_x86.inline.hpp"
  63 #ifdef COMPILER1
  64 #include "c1/c1_Runtime1.hpp"
  65 #endif
  66 #ifdef COMPILER2
  67 #include "opto/runtime.hpp"
  68 #endif
  69 #if INCLUDE_JVMCI
  70 #include "jvmci/jvmciJavaClasses.hpp"
  71 #endif
  72 
  73 #define __ masm->
  74 
  75 #ifdef PRODUCT
  76 #define BLOCK_COMMENT(str) /* nothing */
  77 #else
  78 #define BLOCK_COMMENT(str) __ block_comment(str)
  79 #endif // PRODUCT
  80 
  81 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  82 
  83 class RegisterSaver {
  84   // Capture info about frame layout.  Layout offsets are in jint
  85   // units because compiler frame slots are jints.
  86 #define XSAVE_AREA_BEGIN 160
  87 #define XSAVE_AREA_YMM_BEGIN 576
  88 #define XSAVE_AREA_EGPRS 960
  89 #define XSAVE_AREA_OPMASK_BEGIN 1088
  90 #define XSAVE_AREA_ZMM_BEGIN 1152
  91 #define XSAVE_AREA_UPPERBANK 1664
  92 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  93 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  94 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  95 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  96 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
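       // Example expansion (with BytesPerInt == 4): DEF_XMM_OFFS(0) becomes
       //   xmm0_off = xmm_off + (0)*16/BytesPerInt, xmm0H_off
       // so every XMM register gets a 16-byte (4-slot) stride in the save area, with the
       // *_off constant naming its first 32-bit slot and *H_off the slot after it. The
       // YMM, ZMM, opmask and ZMM-upper-bank variants follow the same pattern with
       // 16-, 32-, 8- and 64-byte strides.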
  97   enum layout {
  98     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  99     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
 100     DEF_XMM_OFFS(0),
 101     DEF_XMM_OFFS(1),
 102     // 2..15 are implied in range usage
 103     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 104     DEF_YMM_OFFS(0),
 105     DEF_YMM_OFFS(1),
 106     r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 107     r16H_off,
 108     r17_off, r17H_off,
 109     r18_off, r18H_off,
 110     r19_off, r19H_off,
 111     r20_off, r20H_off,
 112     r21_off, r21H_off,
 113     r22_off, r22H_off,
 114     r23_off, r23H_off,
 115     r24_off, r24H_off,
 116     r25_off, r25H_off,
 117     r26_off, r26H_off,
 118     r27_off, r27H_off,
 119     r28_off, r28H_off,
 120     r29_off, r29H_off,
 121     r30_off, r30H_off,
 122     r31_off, r31H_off,
 123     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 124     DEF_OPMASK_OFFS(0),
 125     DEF_OPMASK_OFFS(1),
 126     // 2..7 are implied in range usage
 127     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 128     DEF_ZMM_OFFS(0),
 129     DEF_ZMM_OFFS(1),
 130     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 131     DEF_ZMM_UPPER_OFFS(16),
 132     DEF_ZMM_UPPER_OFFS(17),
 133     // 18..31 are implied in range usage
 134     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 135     fpu_stateH_end,
 136     r15_off, r15H_off,
 137     r14_off, r14H_off,
 138     r13_off, r13H_off,
 139     r12_off, r12H_off,
 140     r11_off, r11H_off,
 141     r10_off, r10H_off,
 142     r9_off,  r9H_off,
 143     r8_off,  r8H_off,
 144     rdi_off, rdiH_off,
 145     rsi_off, rsiH_off,
 146     ignore_off, ignoreH_off,  // extra copy of rbp
 147     rsp_off, rspH_off,
 148     rbx_off, rbxH_off,
 149     rdx_off, rdxH_off,
 150     rcx_off, rcxH_off,
 151     rax_off, raxH_off,
 152     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 153     align_off, alignH_off,
 154     flags_off, flagsH_off,
 155     // The frame sender code expects that rbp will be in the "natural" place and
 156     // will override any oopMap setting for it. We must therefore force the layout
 157     // so that it agrees with the frame sender code.
 158     rbp_off, rbpH_off,        // copy of rbp we will restore
 159     return_off, returnH_off,  // slot for return address
 160     reg_save_size             // size in compiler stack slots
 161   };
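       // Reading the enum from top to bottom walks the save area from low to high stack
       // addresses: the FXSAVE/XSAVE state (preceded, if present, by the outgoing argument
       // register save area), then the pushed general purpose registers and flags, the
       // saved rbp, and finally the return address at the highest offset.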
 162 
 163  public:
 164   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 165   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 166 
 167   // Offsets into the register save area
 168   // Used by deoptimization when it is managing result register
 169   // values on its own
 170 
 171   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 172   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 173   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 174   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 175   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 176   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 177 
 178   // During deoptimization only the result registers need to be restored,
 179   // all the other values have already been extracted.
 180   static void restore_result_registers(MacroAssembler* masm);
 181 };
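     // Typical usage sketch (see gen_c2i_adapter below for a real occurrence): a stub that
     // calls into the VM at a safepoint brackets the call with save/restore and records an
     // oop map for the call site, roughly:
     //
     //   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, false);
     //   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
     //   ... set up C arguments and __ call(RuntimeAddress(...)) the VM entry point ...
     //   oop_maps->add_gc_map((int)(__ pc() - start), map);
     //   __ reset_last_Java_frame(false);
     //   RegisterSaver::restore_live_registers(masm);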
 182 
 183 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 184   int off = 0;
 185   int num_xmm_regs = XMMRegister::available_xmm_registers();
 186 #if COMPILER2_OR_JVMCI
 187   if (save_wide_vectors && UseAVX == 0) {
 188     save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 189   }
 190   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 191 #else
 192   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 193 #endif
 194 
 195   // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 196   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 197   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 198   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 199   // CodeBlob frame size is in words.
 200   int frame_size_in_words = frame_size_in_bytes / wordSize;
 201   *total_frame_words = frame_size_in_words;
 202 
 203   // Save registers, fpu state, and flags.
 204   // We assume caller has already pushed the return address onto the
 205   // stack, so rsp is 8-byte aligned here.
 206   // We push rbp twice in this sequence because we want the real rbp
 207   // to be under the return address like a normal enter.
 208 
 209   __ enter();          // rsp becomes 16-byte aligned here
 210   __ pushf();
 211   // Make sure rsp stays 16-byte aligned
 212   __ subq(rsp, 8);
 213   // Push CPU state in multiples of 16 bytes
 214   __ save_legacy_gprs();
 215   __ push_FPU_state();
 216 
 217 
 218   // push cpu state handles this on EVEX enabled targets
 219   if (save_wide_vectors) {
 220     // Save upper half of YMM registers(0..15)
 221     int base_addr = XSAVE_AREA_YMM_BEGIN;
 222     for (int n = 0; n < 16; n++) {
 223       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 224     }
 225     if (VM_Version::supports_evex()) {
 226       // Save upper half of ZMM registers(0..15)
 227       base_addr = XSAVE_AREA_ZMM_BEGIN;
 228       for (int n = 0; n < 16; n++) {
 229         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 230       }
 231       // Save full ZMM registers(16..num_xmm_regs)
 232       base_addr = XSAVE_AREA_UPPERBANK;
 233       off = 0;
 234       int vector_len = Assembler::AVX_512bit;
 235       for (int n = 16; n < num_xmm_regs; n++) {
 236         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 237       }
 238 #if COMPILER2_OR_JVMCI
 239       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 240       off = 0;
 241       for(int n = 0; n < KRegister::number_of_registers; n++) {
 242         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 243       }
 244 #endif
 245     }
 246   } else {
 247     if (VM_Version::supports_evex()) {
 248       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 249       int base_addr = XSAVE_AREA_UPPERBANK;
 250       off = 0;
 251       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 252       for (int n = 16; n < num_xmm_regs; n++) {
 253         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 254       }
 255 #if COMPILER2_OR_JVMCI
 256       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 257       off = 0;
 258       for(int n = 0; n < KRegister::number_of_registers; n++) {
 259         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 260       }
 261 #endif
 262     }
 263   }
 264 
 265 #if COMPILER2_OR_JVMCI
 266   if (UseAPX) {
 267       int base_addr = XSAVE_AREA_EGPRS;
 268       off = 0;
 269       for (int n = 16; n < Register::number_of_registers; n++) {
 270         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 271       }
 272   }
 273 #endif
 274 
 275   __ vzeroupper();
 276   if (frame::arg_reg_save_area_bytes != 0) {
 277     // Allocate argument register save area
 278     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 279   }
 280 
 281   // Set an oopmap for the call site.  This oopmap will map all
 282   // oop-registers and debug-info registers as callee-saved.  This
 283   // will allow deoptimization at this safepoint to find all possible
 284   // debug-info recordings, as well as let GC find all oops.
 285 
 286   OopMapSet *oop_maps = new OopMapSet();
 287   OopMap* map = new OopMap(frame_size_in_slots, 0);
 288 
 289 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 290 
 291   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 293   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 294   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 295   // rbp location is known implicitly by the frame sender code, needs no oopmap
 296   // and the location where rbp was saved is ignored
 297   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 305   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 306   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 307 
 308   if (UseAPX) {
 309     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 324     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 325   }
 326   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 327   // on EVEX-enabled targets we get it included in the xsave area
 328   off = xmm0_off;
 329   int delta = xmm1_off - off;
 330   for (int n = 0; n < 16; n++) {
 331     XMMRegister xmm_name = as_XMMRegister(n);
 332     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 333     off += delta;
 334   }
 335   if (UseAVX > 2) {
 336     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 337     off = zmm16_off;
 338     delta = zmm17_off - off;
 339     for (int n = 16; n < num_xmm_regs; n++) {
 340       XMMRegister zmm_name = as_XMMRegister(n);
 341       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 342       off += delta;
 343     }
 344   }
 345 
 346 #if COMPILER2_OR_JVMCI
 347   if (save_wide_vectors) {
 348     // Save upper half of YMM registers(0..15)
 349     off = ymm0_off;
 350     delta = ymm1_off - ymm0_off;
 351     for (int n = 0; n < 16; n++) {
 352       XMMRegister ymm_name = as_XMMRegister(n);
 353       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 354       off += delta;
 355     }
 356     if (VM_Version::supports_evex()) {
 357       // Save upper half of ZMM registers(0..15)
 358       off = zmm0_off;
 359       delta = zmm1_off - zmm0_off;
 360       for (int n = 0; n < 16; n++) {
 361         XMMRegister zmm_name = as_XMMRegister(n);
 362         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 363         off += delta;
 364       }
 365     }
 366   }
 367 #endif // COMPILER2_OR_JVMCI
 368 
 369   // %%% These should all be a waste but we'll keep things as they were for now
 370   if (true) {
 371     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 373     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 374     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 375     // rbp location is known implicitly by the frame sender code, needs no oopmap
 376     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 385     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 386     if (UseAPX) {
 387       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 402       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 403     }
 404     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 405     // on EVEX-enabled targets we get it included in the xsave area
 406     off = xmm0H_off;
 407     delta = xmm1H_off - off;
 408     for (int n = 0; n < 16; n++) {
 409       XMMRegister xmm_name = as_XMMRegister(n);
 410       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 411       off += delta;
 412     }
 413     if (UseAVX > 2) {
 414       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 415       off = zmm16H_off;
 416       delta = zmm17H_off - off;
 417       for (int n = 16; n < num_xmm_regs; n++) {
 418         XMMRegister zmm_name = as_XMMRegister(n);
 419         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 420         off += delta;
 421       }
 422     }
 423   }
 424 
 425   return map;
 426 }
 427 
 428 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 429   int num_xmm_regs = XMMRegister::available_xmm_registers();
 430   if (frame::arg_reg_save_area_bytes != 0) {
 431     // Pop arg register save area
 432     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 433   }
 434 
 435 #if COMPILER2_OR_JVMCI
 436   if (restore_wide_vectors) {
 437     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 438     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 439   }
 440 #else
 441   assert(!restore_wide_vectors, "vectors are generated only by C2");
 442 #endif
 443 
 444   __ vzeroupper();
 445 
 446   // On EVEX enabled targets everything is handled in pop fpu state
 447   if (restore_wide_vectors) {
 448     // Restore upper half of YMM registers (0..15)
 449     int base_addr = XSAVE_AREA_YMM_BEGIN;
 450     for (int n = 0; n < 16; n++) {
 451       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 452     }
 453     if (VM_Version::supports_evex()) {
 454       // Restore upper half of ZMM registers (0..15)
 455       base_addr = XSAVE_AREA_ZMM_BEGIN;
 456       for (int n = 0; n < 16; n++) {
 457         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 458       }
 459       // Restore full ZMM registers(16..num_xmm_regs)
 460       base_addr = XSAVE_AREA_UPPERBANK;
 461       int vector_len = Assembler::AVX_512bit;
 462       int off = 0;
 463       for (int n = 16; n < num_xmm_regs; n++) {
 464         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 465       }
 466 #if COMPILER2_OR_JVMCI
 467       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 468       off = 0;
 469       for (int n = 0; n < KRegister::number_of_registers; n++) {
 470         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 471       }
 472 #endif
 473     }
 474   } else {
 475     if (VM_Version::supports_evex()) {
 476       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 477       int base_addr = XSAVE_AREA_UPPERBANK;
 478       int off = 0;
 479       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 480       for (int n = 16; n < num_xmm_regs; n++) {
 481         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 482       }
 483 #if COMPILER2_OR_JVMCI
 484       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 485       off = 0;
 486       for (int n = 0; n < KRegister::number_of_registers; n++) {
 487         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 488       }
 489 #endif
 490     }
 491   }
 492 
 493 #if COMPILER2_OR_JVMCI
 494   if (UseAPX) {
 495     int base_addr = XSAVE_AREA_EGPRS;
 496     int off = 0;
 497     for (int n = 16; n < Register::number_of_registers; n++) {
 498       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 499     }
 500   }
 501 #endif
 502 
 503   // Recover CPU state
 504   __ pop_FPU_state();
 505   __ restore_legacy_gprs();
 506   __ addq(rsp, 8);
 507   __ popf();
 508   // Get the rbp described implicitly by the calling convention (no oopMap)
 509   __ pop(rbp);
 510 }
 511 
 512 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 513 
 514   // Just restore result registers. Only used by deoptimization. By
 515   // now any callee save register that needs to be restored to a c2
 516   // caller of the deoptee has been extracted into the vframeArray
 517   // and will be stuffed into the c2i adapter we create for later
 518   // restoration so only result registers need to be restored here.
 519 
 520   // Restore fp result register
 521   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 522   // Restore integer result register
 523   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 524   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 525 
 526   // Pop all of the register save area off the stack except the return address
 527   __ addptr(rsp, return_offset_in_bytes());
 528 }
 529 
 530 // Is the vector's size (in bytes) bigger than the size saved by default?
 531 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 532 bool SharedRuntime::is_wide_vector(int size) {
 533   return size > 16;
 534 }
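     // For example, 32-byte (YMM) and 64-byte (ZMM) vectors are "wide" and need the extra
     // save/restore paths above, while the default 16-byte XMM state is already covered by
     // fxsave/fxrstor.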
 535 
 536 // ---------------------------------------------------------------------------
 537 // Read the array of BasicTypes from a signature, and compute where the
 538 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 539 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 540 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 541 // as framesizes are fixed.
 542 // VMRegImpl::stack0 refers to the first slot 0(sp), and VMRegImpl::stack0+1
 543 // refers to the memory word 4 bytes higher.
 544 // Registers up to Register::number_of_registers are the 64-bit
 545 // integer registers.
 546 
 547 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 548 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 549 // units regardless of build. Of course for i486 there is no 64-bit build.
 550 
 551 // The Java calling convention is a "shifted" version of the C ABI.
 552 // By skipping the first C ABI register we can call non-static jni methods
 553 // with small numbers of arguments without having to shuffle the arguments
 554 // at all. Since we control the java ABI we ought to at least get some
 555 // advantage out of it.
 556 
 557 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 558                                            VMRegPair *regs,
 559                                            int total_args_passed) {
 560 
 561   // Create the mapping between argument positions and
 562   // registers.
 563   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 564     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 565   };
 566   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 567     j_farg0, j_farg1, j_farg2, j_farg3,
 568     j_farg4, j_farg5, j_farg6, j_farg7
 569   };
 570 
 571 
 572   uint int_args = 0;
 573   uint fp_args = 0;
 574   uint stk_args = 0;
 575 
 576   for (int i = 0; i < total_args_passed; i++) {
 577     switch (sig_bt[i]) {
 578     case T_BOOLEAN:
 579     case T_CHAR:
 580     case T_BYTE:
 581     case T_SHORT:
 582     case T_INT:
 583       if (int_args < Argument::n_int_register_parameters_j) {
 584         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 585       } else {
 586         stk_args = align_up(stk_args, 2);
 587         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 588         stk_args += 1;
 589       }
 590       break;
 591     case T_VOID:
 592       // halves of T_LONG or T_DOUBLE
 593       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 594       regs[i].set_bad();
 595       break;
 596     case T_LONG:
 597       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 598       // fall through
 599     case T_OBJECT:
 600     case T_ARRAY:
 601     case T_ADDRESS:
 602       if (int_args < Argument::n_int_register_parameters_j) {
 603         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 604       } else {
 605         stk_args = align_up(stk_args, 2);
 606         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 607         stk_args += 2;
 608       }
 609       break;
 610     case T_FLOAT:
 611       if (fp_args < Argument::n_float_register_parameters_j) {
 612         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 613       } else {
 614         stk_args = align_up(stk_args, 2);
 615         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 616         stk_args += 1;
 617       }
 618       break;
 619     case T_DOUBLE:
 620       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 621       if (fp_args < Argument::n_float_register_parameters_j) {
 622         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 623       } else {
 624         stk_args = align_up(stk_args, 2);
 625         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 626         stk_args += 2;
 627       }
 628       break;
 629     default:
 630       ShouldNotReachHere();
 631       break;
 632     }
 633   }
 634 
 635   return stk_args;
 636 }
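     // Worked example (illustrative signature only): for
     //   sig_bt = { T_INT, T_LONG, T_VOID, T_DOUBLE, T_VOID, T_OBJECT }
     // the loop above produces
     //   T_INT    -> j_rarg0 (set1)
     //   T_LONG   -> j_rarg1 (set2; the trailing T_VOID half is set_bad)
     //   T_DOUBLE -> j_farg0 (set2; the trailing T_VOID half is set_bad)
     //   T_OBJECT -> j_rarg2 (set2)
     // and returns stk_args == 0 since nothing spills to the stack.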
 637 
 638 // Same as java_calling_convention() but for multiple return
 639 // values. There's no way to store them on the stack so if we don't
 640 // have enough registers, multiple values can't be returned.
 641 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 642 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
 643 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 644                                           VMRegPair *regs,
 645                                           int total_args_passed) {
 646   // Create the mapping between argument positions and
 647   // registers.
 648   static const Register INT_ArgReg[java_return_convention_max_int] = {
 649     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 650   };
 651   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 652     j_farg0, j_farg1, j_farg2, j_farg3,
 653     j_farg4, j_farg5, j_farg6, j_farg7
 654   };
 655 
 656 
 657   uint int_args = 0;
 658   uint fp_args = 0;
 659 
 660   for (int i = 0; i < total_args_passed; i++) {
 661     switch (sig_bt[i]) {
 662     case T_BOOLEAN:
 663     case T_CHAR:
 664     case T_BYTE:
 665     case T_SHORT:
 666     case T_INT:
 667       if (int_args < Argument::n_int_register_parameters_j+1) {
 668         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 669         int_args++;
 670       } else {
 671         return -1;
 672       }
 673       break;
 674     case T_VOID:
 675       // halves of T_LONG or T_DOUBLE
 676       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 677       regs[i].set_bad();
 678       break;
 679     case T_LONG:
 680       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 681       // fall through
 682     case T_OBJECT:
 683     case T_ARRAY:
 684     case T_ADDRESS:
 685     case T_METADATA:
 686       if (int_args < Argument::n_int_register_parameters_j+1) {
 687         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 688         int_args++;
 689       } else {
 690         return -1;
 691       }
 692       break;
 693     case T_FLOAT:
 694       if (fp_args < Argument::n_float_register_parameters_j) {
 695         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 696         fp_args++;
 697       } else {
 698         return -1;
 699       }
 700       break;
 701     case T_DOUBLE:
 702       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 703       if (fp_args < Argument::n_float_register_parameters_j) {
 704         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 705         fp_args++;
 706       } else {
 707         return -1;
 708       }
 709       break;
 710     default:
 711       ShouldNotReachHere();
 712       break;
 713     }
 714   }
 715 
 716   return int_args + fp_args;
 717 }
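     // Worked example (illustrative signature only): for a multi-value return described by
     //   sig_bt = { T_LONG, T_VOID, T_INT }
     // the T_LONG is returned in rax and the T_INT in j_rarg5 (INT_ArgReg[1]), and the
     // function returns 2. As soon as a value cannot be placed in a register, -1 is
     // returned instead.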
 718 
 719 // Patch the caller's callsite with the entry to compiled code if it exists.
 720 static void patch_callers_callsite(MacroAssembler *masm) {
 721   Label L;
 722   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 723   __ jcc(Assembler::equal, L);
 724 
 725   // Save the current stack pointer
 726   __ mov(r13, rsp);
 727   // Schedule the branch target address early.
 728   // Call into the VM to patch the caller, then jump to compiled callee
 729   // rax isn't live so capture return address while we easily can
 730   __ movptr(rax, Address(rsp, 0));
 731 
 732   // align stack so push_CPU_state doesn't fault
 733   __ andptr(rsp, -(StackAlignmentInBytes));
 734   __ push_CPU_state();
 735   __ vzeroupper();
 736   // VM needs caller's callsite
 737   // VM needs target method
 738   // This needs to be a long call since we will relocate this adapter to
 739   // the codeBuffer and it may not reach
 740 
 741   // Allocate argument register save area
 742   if (frame::arg_reg_save_area_bytes != 0) {
 743     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 744   }
 745   __ mov(c_rarg0, rbx);
 746   __ mov(c_rarg1, rax);
 747   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 748 
 749   // De-allocate argument register save area
 750   if (frame::arg_reg_save_area_bytes != 0) {
 751     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 752   }
 753 
 754   __ vzeroupper();
 755   __ pop_CPU_state();
 756   // restore sp
 757   __ mov(rsp, r13);
 758   __ bind(L);
 759 }
 760 
 761 // For each inline type argument, sig includes the list of fields of
 762 // the inline type. This utility function computes the number of
 763 // arguments for the call if inline types are passed by reference (the
 764 // calling convention the interpreter expects).
 765 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 766   int total_args_passed = 0;
 767   if (InlineTypePassFieldsAsArgs) {
 768     for (int i = 0; i < sig_extended->length(); i++) {
 769       BasicType bt = sig_extended->at(i)._bt;
 770       if (bt == T_METADATA) {
 771         // In sig_extended, an inline type argument starts with:
 772         // T_METADATA, followed by the types of the fields of the
 773         // inline type and T_VOID to mark the end of the inline
 774         // type. Inline types are flattened so, for instance, in the
 775         // case of an inline type with an int field and an inline type
 776         // field that itself has 2 fields, an int and a long:
 777         // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
 778         // slot for the T_LONG) T_VOID (inner inline type) T_VOID
 779         // (outer inline type)
 780         total_args_passed++;
 781         int vt = 1;
 782         do {
 783           i++;
 784           BasicType bt = sig_extended->at(i)._bt;
 785           BasicType prev_bt = sig_extended->at(i-1)._bt;
 786           if (bt == T_METADATA) {
 787             vt++;
 788           } else if (bt == T_VOID &&
 789                      prev_bt != T_LONG &&
 790                      prev_bt != T_DOUBLE) {
 791             vt--;
 792           }
 793         } while (vt != 0);
 794       } else {
 795         total_args_passed++;
 796       }
 797     }
 798   } else {
 799     total_args_passed = sig_extended->length();
 800   }
 801   return total_args_passed;
 802 }
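     // Worked example (matching the comment above): with InlineTypePassFieldsAsArgs, the
     // flattened signature
     //   { T_METADATA, T_INT, T_METADATA, T_INT, T_LONG, T_VOID, T_VOID, T_VOID }
     // describes a single inline type argument (with a nested inline type field), so
     // compute_total_args_passed_int() returns 1.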
 803 
 804 
 805 static void gen_c2i_adapter_helper(MacroAssembler* masm,
 806                                    BasicType bt,
 807                                    BasicType prev_bt,
 808                                    size_t size_in_bytes,
 809                                    const VMRegPair& reg_pair,
 810                                    const Address& to,
 811                                    int extraspace,
 812                                    bool is_oop) {
 813   if (bt == T_VOID) {
 814     assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
 815     return;
 816   }
 817 
 818   // Say 4 args:
 819   // i   st_off
 820   // 0   32 T_LONG
 821   // 1   24 T_VOID
 822   // 2   16 T_OBJECT
 823   // 3    8 T_BOOL
 824   // -    0 return address
 825   //
 826   // However, to make things extra confusing: because we can fit a long/double in
 827   // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 828   // leaves one slot empty and only stores to a single slot. In this case the
 829   // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 830 
 831   bool wide = (size_in_bytes == wordSize);
 832   VMReg r_1 = reg_pair.first();
 833   VMReg r_2 = reg_pair.second();
 834   assert(r_2->is_valid() == wide, "invalid size");
 835   if (!r_1->is_valid()) {
 836     assert(!r_2->is_valid(), "must be invalid");
 837     return;
 838   }
 839 
 840   if (!r_1->is_XMMRegister()) {
 841     Register val = rax;
 842     if (r_1->is_stack()) {
 843       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 844       __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
 845     } else {
 846       val = r_1->as_Register();
 847     }
 848     assert_different_registers(to.base(), val, rscratch1);
 849     if (is_oop) {
 850       __ push(r13);
 851       __ push(rbx);
 852       __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
 853       __ pop(rbx);
 854       __ pop(r13);
 855     } else {
 856       __ store_sized_value(to, val, size_in_bytes);
 857     }
 858   } else {
 859     if (wide) {
 860       __ movdbl(to, r_1->as_XMMRegister());
 861     } else {
 862       __ movflt(to, r_1->as_XMMRegister());
 863     }
 864   }
 865 }
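     // Note for callers: size_in_bytes is 8 (wide == true) exactly when the value occupies
     // a full word (T_LONG, T_DOUBLE, oops), in which case r_2 is the valid second half;
     // narrower values arrive with size_in_bytes of 1, 2 or 4 and only use r_1 (see the
     // assert on r_2 above).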
 866 
 867 static void gen_c2i_adapter(MacroAssembler *masm,
 868                             const GrowableArray<SigEntry>* sig_extended,
 869                             const VMRegPair *regs,
 870                             bool requires_clinit_barrier,
 871                             address& c2i_no_clinit_check_entry,
 872                             Label& skip_fixup,
 873                             address start,
 874                             OopMapSet* oop_maps,
 875                             int& frame_complete,
 876                             int& frame_size_in_words,
 877                             bool alloc_inline_receiver) {
 878   if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) {
 879     Label L_skip_barrier;
 880     Register method = rbx;
 881 
 882     { // Bypass the barrier for non-static methods
 883       Register flags = rscratch1;
 884       __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
 885       __ testl(flags, JVM_ACC_STATIC);
 886       __ jcc(Assembler::zero, L_skip_barrier); // non-static
 887     }
 888 
 889     Register klass = rscratch1;
 890     __ load_method_holder(klass, method);
 891     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
 892 
 893     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
 894 
 895     __ bind(L_skip_barrier);
 896     c2i_no_clinit_check_entry = __ pc();
 897   }
 898 
 899   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 900   bs->c2i_entry_barrier(masm);
 901 
 902   // Before we get into the guts of the C2I adapter, see if we should be here
 903   // at all.  We've come from compiled code and are attempting to jump to the
 904   // interpreter, which means the caller made a static call to get here
 905   // (vcalls always get a compiled target if there is one).  Check for a
 906   // compiled target.  If there is one, we need to patch the caller's call.
 907   patch_callers_callsite(masm);
 908 
 909   __ bind(skip_fixup);
 910 
 911   if (InlineTypePassFieldsAsArgs) {
 912     // Is there an inline type argument?
 913     bool has_inline_argument = false;
 914     for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
 915       has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
 916     }
 917     if (has_inline_argument) {
 918       // There is at least one inline type argument: we're coming from
 919       // compiled code so we have no buffers to back the inline types.
 920       // Allocate the buffers here with a runtime call.
 921       OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
 922 
 923       frame_complete = __ offset();
 924 
 925       __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
 926 
 927       __ mov(c_rarg0, r15_thread);
 928       __ mov(c_rarg1, rbx);
 929       __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
 930       __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
 931 
 932       oop_maps->add_gc_map((int)(__ pc() - start), map);
 933       __ reset_last_Java_frame(false);
 934 
 935       RegisterSaver::restore_live_registers(masm);
 936 
 937       Label no_exception;
 938       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
 939       __ jcc(Assembler::equal, no_exception);
 940 
 941       __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
 942       __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 943       __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 944 
 945       __ bind(no_exception);
 946 
 947       // We get an array of objects from the runtime call
 948       __ get_vm_result_oop(rscratch2); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
 949       __ get_vm_result_metadata(rbx); // TODO: required to keep the callee Method live?
 950     }
 951   }
 952 
 953   // Since all args are passed on the stack, total_args_passed *
 954   // Interpreter::stackElementSize is the space we need.
 955   int total_args_passed = compute_total_args_passed_int(sig_extended);
 956   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 957 
 958   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 959 
 960   // stack is aligned, keep it that way
 961   // This is not currently needed or enforced by the interpreter, but
 962   // we might as well conform to the ABI.
 963   extraspace = align_up(extraspace, 2*wordSize);
 964 
 965   // set senderSP value
 966   __ lea(r13, Address(rsp, wordSize));
 967 
 968 #ifdef ASSERT
 969   __ check_stack_alignment(r13, "sender stack not aligned");
 970 #endif
 971   if (extraspace > 0) {
 972     // Pop the return address
 973     __ pop(rax);
 974 
 975     __ subptr(rsp, extraspace);
 976 
 977     // Push the return address
 978     __ push(rax);
 979 
 980     // Account for the return address location since we store it first rather
 981     // than hold it in a register across all the shuffling
 982     extraspace += wordSize;
 983   }
 984 
 985 #ifdef ASSERT
 986   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 987 #endif
 988 
 989   // Now write the args into the outgoing interpreter space
 990 
 991   // next_arg_comp is the next argument from the compiler point of
 992   // view (inline type fields are passed in registers/on the stack). In
 993   // sig_extended, an inline type argument starts with: T_METADATA,
 994   // followed by the types of the fields of the inline type and T_VOID
 995   // to mark the end of the inline type. ignored counts the number of
 996   // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
 997   // used to get the buffer for that argument from the pool of buffers
 998   // we allocated above and want to pass to the
 999   // interpreter. next_arg_int is the next argument from the
1000   // interpreter point of view (inline types are passed by reference).
1001   for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
1002        next_arg_comp < sig_extended->length(); next_arg_comp++) {
1003     assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
1004     assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
1005     BasicType bt = sig_extended->at(next_arg_comp)._bt;
1006     int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
1007     if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
1008       int next_off = st_off - Interpreter::stackElementSize;
1009       const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
1010       const VMRegPair reg_pair = regs[next_arg_comp-ignored];
1011       size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
1012       gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1013                              size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
1014       next_arg_int++;
1015 #ifdef ASSERT
1016       if (bt == T_LONG || bt == T_DOUBLE) {
1017         // Overwrite the unused slot with known junk
1018         __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
1019         __ movptr(Address(rsp, st_off), rax);
1020       }
1021 #endif /* ASSERT */
1022     } else {
1023       ignored++;
1024       // get the buffer from the just allocated pool of buffers
1025       int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
1026       __ load_heap_oop(r14, Address(rscratch2, index));
1027       next_vt_arg++; next_arg_int++;
1028       int vt = 1;
1029       // write fields we get from compiled code in registers/stack
1030       // slots to the buffer: we know we are done with that inline type
1031       // argument when we hit the T_VOID that acts as an end of inline
1032       // type delimiter for this inline type. Inline types are flattened
1033       // so we might encounter embedded inline types. Each entry in
1034       // sig_extended contains a field offset in the buffer.
1035       Label L_null;
1036       do {
1037         next_arg_comp++;
1038         BasicType bt = sig_extended->at(next_arg_comp)._bt;
1039         BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
1040         if (bt == T_METADATA) {
1041           vt++;
1042           ignored++;
1043         } else if (bt == T_VOID &&
1044                    prev_bt != T_LONG &&
1045                    prev_bt != T_DOUBLE) {
1046           vt--;
1047           ignored++;
1048         } else {
1049           int off = sig_extended->at(next_arg_comp)._offset;
1050           if (off == -1) {
1051             // Nullable inline type argument, emit null check
1052             VMReg reg = regs[next_arg_comp-ignored].first();
1053             Label L_notNull;
1054             if (reg->is_stack()) {
1055               int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
1056               __ testb(Address(rsp, ld_off), 1);
1057             } else {
1058               __ testb(reg->as_Register(), 1);
1059             }
1060             __ jcc(Assembler::notZero, L_notNull);
1061             __ movptr(Address(rsp, st_off), 0);
1062             __ jmp(L_null);
1063             __ bind(L_notNull);
1064             continue;
1065           }
1066           assert(off > 0, "offset in object should be positive");
1067           size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
1068           bool is_oop = is_reference_type(bt);
1069           gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1070                                  size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
1071         }
1072       } while (vt != 0);
1073       // pass the buffer to the interpreter
1074       __ movptr(Address(rsp, st_off), r14);
1075       __ bind(L_null);
1076     }
1077   }
1078 
1079   // Schedule the branch target address early.
1080   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
1081   __ jmp(rcx);
1082 }
1083 
1084 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
1085                                     int comp_args_on_stack,
1086                                     const GrowableArray<SigEntry>* sig,
1087                                     const VMRegPair *regs) {
1088 
1089   // Note: r13 contains the senderSP on entry. We must preserve it since
1090   // we may do an i2c -> c2i transition if we lose a race where compiled
1091   // code goes non-entrant while we get args ready.
1092   // In addition we use r13 to locate all the interpreter args as
1093   // we must align the stack to 16 bytes on an i2c entry, else we
1094   // lose the alignment we expect in all compiled code and the register
1095   // save code can segv when fxsave instructions find an improperly
1096   // aligned stack pointer.
1097 
1098   // Adapters can be frameless because they do not require the caller
1099   // to perform additional cleanup work, such as correcting the stack pointer.
1100   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
1101   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
1102   // even if a callee has modified the stack pointer.
1103   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
1104   // routinely repairs its caller's stack pointer (from sender_sp, which is set
1105   // up via the senderSP register).
1106   // In other words, if *either* the caller or callee is interpreted, we can
1107   // get the stack pointer repaired after a call.
1108   // This is why c2i and i2c adapters cannot be indefinitely composed.
1109   // In particular, if a c2i adapter were to somehow call an i2c adapter,
1110   // both caller and callee would be compiled methods, and neither would
1111   // clean up the stack pointer changes performed by the two adapters.
1112   // If this happens, control eventually transfers back to the compiled
1113   // caller, but with an uncorrected stack, causing delayed havoc.
1114 
1115   // Must preserve original SP for loading incoming arguments because
1116   // we need to align the outgoing SP for compiled code.
1117   __ movptr(r11, rsp);
1118 
1119   // Pick up the return address
1120   __ pop(rax);
1121 
1122   // Convert 4-byte c2 stack slots to words.
1123   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
1124 
1125   if (comp_args_on_stack) {
1126     __ subptr(rsp, comp_words_on_stack * wordSize);
1127   }
1128 
1129   // Ensure compiled code always sees stack at proper alignment
1130   __ andptr(rsp, -16);
1131 
1132   // push the return address, misaligning the stack just as the youngest frame
1133   // always sees it with respect to the placement of the call instruction
1134   __ push(rax);
1135 
1136   // Put saved SP in another register
1137   const Register saved_sp = rax;
1138   __ movptr(saved_sp, r11);
1139 
1140   // Will jump to the compiled code just as if compiled code was doing it.
1141   // Pre-load the register-jump target early, to schedule it better.
1142   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1143 
1144 #if INCLUDE_JVMCI
1145   if (EnableJVMCI) {
1146     // check if this call should be routed towards a specific entry point
1147     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1148     Label no_alternative_target;
1149     __ jcc(Assembler::equal, no_alternative_target);
1150     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1151     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1152     __ bind(no_alternative_target);
1153   }
1154 #endif // INCLUDE_JVMCI
1155 
1156   int total_args_passed = sig->length();
1157 
1158   // Now generate the shuffle code.  Pick up all register args and move the
1159   // rest through the floating point stack top.
1160   for (int i = 0; i < total_args_passed; i++) {
1161     BasicType bt = sig->at(i)._bt;
1162     if (bt == T_VOID) {
1163       // Longs and doubles are passed in native word order, but misaligned
1164       // in the 32-bit build.
1165       BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1166       assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1167       continue;
1168     }
1169 
1170     // Pick up 0, 1 or 2 words from SP+offset.
1171 
1172     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1173             "scrambled load targets?");
1174     // Load in argument order going down.
1175     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1176     // Point to interpreter value (vs. tag)
1177     int next_off = ld_off - Interpreter::stackElementSize;
1178     //
1179     //
1180     //
1181     VMReg r_1 = regs[i].first();
1182     VMReg r_2 = regs[i].second();
1183     if (!r_1->is_valid()) {
1184       assert(!r_2->is_valid(), "");
1185       continue;
1186     }
1187     if (r_1->is_stack()) {
1188       // Convert stack slot to an SP offset (+ wordSize to account for return address )
1189       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1190 
1191       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
1192       // and if we end up going thru a c2i because of a miss, a reasonable value of r13
1193       // will be generated.
1194       if (!r_2->is_valid()) {
1195         // sign extend???
1196         __ movl(r13, Address(saved_sp, ld_off));
1197         __ movptr(Address(rsp, st_off), r13);
1198       } else {
1199         //
1200         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
1201         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
1202         // So we must adjust where to pick up the data to match the interpreter.
1203         //
1204         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
1205         // are accessed at negative offsets so the LSW is at the LOW address
1206 
1207         // ld_off is MSW so get LSW
1208         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1209                            next_off : ld_off;
1210         __ movq(r13, Address(saved_sp, offset));
1211         // st_off is LSW (i.e. reg.first())
1212         __ movq(Address(rsp, st_off), r13);
1213       }
1214     } else if (r_1->is_Register()) {  // Register argument
1215       Register r = r_1->as_Register();
1216       assert(r != rax, "must be different");
1217       if (r_2->is_valid()) {
1218         //
1219         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
1220         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
1221         // So we must adjust where to pick up the data to match the interpreter.
1222 
1223         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1224                            next_off : ld_off;
1225 
1226         // this can be a misaligned move
1227         __ movq(r, Address(saved_sp, offset));
1228       } else {
1229         // sign extend and use a full word?
1230         __ movl(r, Address(saved_sp, ld_off));
1231       }
1232     } else {
1233       if (!r_2->is_valid()) {
1234         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1235       } else {
1236         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1237       }
1238     }
1239   }
1240 
1241   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1242 
1243   // 6243940 We might end up in handle_wrong_method if
1244   // the callee is deoptimized as we race thru here. If that
1245   // happens we don't want to take a safepoint because the
1246   // caller frame will look interpreted and arguments are now
1247   // "compiled" so it is much better to make this transition
1248   // invisible to the stack walking code. Unfortunately if
1249   // we try and find the callee by normal means a safepoint
1250   // is possible. So we stash the desired callee in the thread
1251   // and the VM will find it there should this case occur.
1252 
1253   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1254 
1255   // put Method* where a c2i would expect it, should we end up there;
1256   // only needed because c2 resolve stubs return the Method* as a result in
1257   // rax
1258   __ mov(rax, rbx);
1259   __ jmp(r11);
1260 }
1261 
1262 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1263   Register data = rax;
1264   __ ic_check(1 /* end_alignment */);
1265   __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1266 
1267   // Method might have been compiled since the call site was patched to
1268   // interpreted; if that is the case, treat it as a miss so we can get
1269   // the call site corrected.
1270   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1271   __ jcc(Assembler::equal, skip_fixup);
1272   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1273 }
1274 
1275 // ---------------------------------------------------------------
1276 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1277                                             int comp_args_on_stack,
1278                                             const GrowableArray<SigEntry>* sig,
1279                                             const VMRegPair* regs,
1280                                             const GrowableArray<SigEntry>* sig_cc,
1281                                             const VMRegPair* regs_cc,
1282                                             const GrowableArray<SigEntry>* sig_cc_ro,
1283                                             const VMRegPair* regs_cc_ro,
1284                                             AdapterHandlerEntry* handler,
1285                                             AdapterBlob*& new_adapter,
1286                                             bool allocate_code_blob) {
1287   address i2c_entry = __ pc();
1288   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1289 
1290   // -------------------------------------------------------------------------
1291   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1292   // to the interpreter.  The args start out packed in the compiled layout.  They
1293   // need to be unpacked into the interpreter layout.  This will almost always
1294   // require some stack space.  We grow the current (compiled) stack, then repack
1295   // the args.  We  finally end in a jump to the generic interpreter entry point.
1296   // On exit from the interpreter, the interpreter will restore our SP (lest the
1297   // compiled code, which relies solely on SP and not RBP, get sick).
1298 
1299   address c2i_unverified_entry        = __ pc();
1300   address c2i_unverified_inline_entry = __ pc();
1301   Label skip_fixup;
1302 
1303   gen_inline_cache_check(masm, skip_fixup);
1304 
1305   OopMapSet* oop_maps = new OopMapSet();
1306   int frame_complete = CodeOffsets::frame_never_safe;
1307   int frame_size_in_words = 0;
1308 
1309   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1310   address c2i_no_clinit_check_entry = nullptr;
1311   address c2i_inline_ro_entry = __ pc();
1312   if (regs_cc != regs_cc_ro) {
1313     // No class init barrier needed because method is guaranteed to be non-static
1314     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, c2i_no_clinit_check_entry,
1315                     skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1316     skip_fixup.reset();
1317   }
1318 
1319   // Scalarized c2i adapter
1320   address c2i_entry        = __ pc();
1321   address c2i_inline_entry = __ pc();
1322   gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1323                   skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);
1324 
1325   // Non-scalarized c2i adapter
1326   if (regs != regs_cc) {
1327     c2i_unverified_inline_entry = __ pc();
1328     Label inline_entry_skip_fixup;
1329     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1330 
1331     c2i_inline_entry = __ pc();
1332     gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1333                     inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1334   }
1335 
1336   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1337   // the GC knows about the location of oop argument locations passed to the c2i adapter.
1338   if (allocate_code_blob) {
1339     bool caller_must_gc_arguments = (regs != regs_cc);
1340     new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1341   }
1342 
1343   handler->set_entry_points(i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry,
1344                             c2i_unverified_inline_entry, c2i_no_clinit_check_entry);
1345 }
1346 
1347 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1348                                          VMRegPair *regs,
1349                                          int total_args_passed) {
1350 
1351 // We return the number of VMRegImpl stack slots we need to reserve for all
1352 // the arguments, NOT counting out_preserve_stack_slots.
1353 
1354 // NOTE: These arrays will have to change when c1 is ported
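     // As a rough illustration (assuming the usual HotSpot argument register aliases): for a
     // native signature (jint, jlong, jdouble), the SysV path below hands out c_rarg0, c_rarg1
     // and c_farg0 with stk_args == 0, while the Win64 path hands out c_rarg0, c_rarg1 and the
     // positional c_farg2, and still reports 8 stack slots for the caller-allocated home area.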
1355 #ifdef _WIN64
1356     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1357       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1358     };
1359     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1360       c_farg0, c_farg1, c_farg2, c_farg3
1361     };
1362 #else
1363     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1364       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1365     };
1366     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1367       c_farg0, c_farg1, c_farg2, c_farg3,
1368       c_farg4, c_farg5, c_farg6, c_farg7
1369     };
1370 #endif // _WIN64
1371 
1372 
1373     uint int_args = 0;
1374     uint fp_args = 0;
1375     uint stk_args = 0; // inc by 2 each time
1376 
1377     for (int i = 0; i < total_args_passed; i++) {
1378       switch (sig_bt[i]) {
1379       case T_BOOLEAN:
1380       case T_CHAR:
1381       case T_BYTE:
1382       case T_SHORT:
1383       case T_INT:
1384         if (int_args < Argument::n_int_register_parameters_c) {
1385           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1386 #ifdef _WIN64
1387           fp_args++;
1388           // Allocate slots for the callee to stuff register args on the stack.
1389           stk_args += 2;
1390 #endif
1391         } else {
1392           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1393           stk_args += 2;
1394         }
1395         break;
1396       case T_LONG:
1397         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1398         // fall through
1399       case T_OBJECT:
1400       case T_ARRAY:
1401       case T_ADDRESS:
1402       case T_METADATA:
1403         if (int_args < Argument::n_int_register_parameters_c) {
1404           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1405 #ifdef _WIN64
1406           fp_args++;
1407           stk_args += 2;
1408 #endif
1409         } else {
1410           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1411           stk_args += 2;
1412         }
1413         break;
1414       case T_FLOAT:
1415         if (fp_args < Argument::n_float_register_parameters_c) {
1416           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1417 #ifdef _WIN64
1418           int_args++;
1419           // Allocate slots for the callee to stuff register args on the stack.
1420           stk_args += 2;
1421 #endif
1422         } else {
1423           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1424           stk_args += 2;
1425         }
1426         break;
1427       case T_DOUBLE:
1428         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1429         if (fp_args < Argument::n_float_register_parameters_c) {
1430           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1431 #ifdef _WIN64
1432           int_args++;
1433           // Allocate slots for the callee to stuff register args on the stack.
1434           stk_args += 2;
1435 #endif
1436         } else {
1437           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1438           stk_args += 2;
1439         }
1440         break;
1441       case T_VOID: // Halves of longs and doubles
1442         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1443         regs[i].set_bad();
1444         break;
1445       default:
1446         ShouldNotReachHere();
1447         break;
1448       }
1449     }
1450 #ifdef _WIN64
1451   // The Windows ABI requires that we always allocate enough stack space
1452   // for 4 64-bit registers to be stored down (the caller-allocated home area).
1453   if (stk_args < 8) {
1454     stk_args = 8;
1455   }
1456 #endif // _WIN64
1457 
1458   return stk_args;
1459 }
1460 
1461 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1462                                              uint num_bits,
1463                                              uint total_args_passed) {
1464   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1465          "only certain vector sizes are supported for now");
1466 
1467   static const XMMRegister VEC_ArgReg[32] = {
1468      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1469      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1470     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1471     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1472   };
1473 
1474   uint stk_args = 0;
1475   uint fp_args = 0;
1476 
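       // Each vector argument gets a whole XMM/YMM/ZMM register; its VMRegPair spans
       // num_bits/32 stack-slot-sized VMRegs, hence next(1)/next(3)/next(7)/next(15)
       // for 64/128/256/512-bit vectors respectively.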
1477   for (uint i = 0; i < total_args_passed; i++) {
1478     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1479     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1480     regs[i].set_pair(vmreg->next(next_val), vmreg);
1481   }
1482 
1483   return stk_args;
1484 }
1485 
1486 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1487   // We always ignore the frame_slots arg and just use the space just below the
1488   // frame pointer, which by this time is free to use.
1489   switch (ret_type) {
1490   case T_FLOAT:
1491     __ movflt(Address(rbp, -wordSize), xmm0);
1492     break;
1493   case T_DOUBLE:
1494     __ movdbl(Address(rbp, -wordSize), xmm0);
1495     break;
1496   case T_VOID:  break;
1497   default: {
1498     __ movptr(Address(rbp, -wordSize), rax);
1499     }
1500   }
1501 }
1502 
1503 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1504   // We always ignore the frame_slots arg and just use the space just below the
1505   // frame pointer, which by this time is free to use.
1506   switch (ret_type) {
1507   case T_FLOAT:
1508     __ movflt(xmm0, Address(rbp, -wordSize));
1509     break;
1510   case T_DOUBLE:
1511     __ movdbl(xmm0, Address(rbp, -wordSize));
1512     break;
1513   case T_VOID:  break;
1514   default: {
1515     __ movptr(rax, Address(rbp, -wordSize));
1516     }
1517   }
1518 }
1519 
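     // Spill (and later reload) the live outgoing argument registers around a VM leaf call.
     // restore_args walks the arguments in reverse so its pops/loads mirror the
     // pushes/stores done by save_args.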
1520 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1521     for ( int i = first_arg ; i < arg_count ; i++ ) {
1522       if (args[i].first()->is_Register()) {
1523         __ push(args[i].first()->as_Register());
1524       } else if (args[i].first()->is_XMMRegister()) {
1525         __ subptr(rsp, 2*wordSize);
1526         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1527       }
1528     }
1529 }
1530 
1531 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1532     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1533       if (args[i].first()->is_Register()) {
1534         __ pop(args[i].first()->as_Register());
1535       } else if (args[i].first()->is_XMMRegister()) {
1536         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1537         __ addptr(rsp, 2*wordSize);
1538       }
1539     }
1540 }
1541 
1542 static void verify_oop_args(MacroAssembler* masm,
1543                             const methodHandle& method,
1544                             const BasicType* sig_bt,
1545                             const VMRegPair* regs) {
1546   Register temp_reg = rbx;  // not part of any compiled calling seq
1547   if (VerifyOops) {
1548     for (int i = 0; i < method->size_of_parameters(); i++) {
1549       if (is_reference_type(sig_bt[i])) {
1550         VMReg r = regs[i].first();
1551         assert(r->is_valid(), "bad oop arg");
1552         if (r->is_stack()) {
1553           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1554           __ verify_oop(temp_reg);
1555         } else {
1556           __ verify_oop(r->as_Register());
1557         }
1558       }
1559     }
1560   }
1561 }
1562 
1563 static void check_continuation_enter_argument(VMReg actual_vmreg,
1564                                               Register expected_reg,
1565                                               const char* name) {
1566   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1567   assert(actual_vmreg->as_Register() == expected_reg,
1568          "%s is in unexpected register: %s instead of %s",
1569          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1570 }
1571 
1572 
1573 //---------------------------- continuation_enter_setup ---------------------------
1574 //
1575 // Arguments:
1576 //   None.
1577 //
1578 // Results:
1579 //   rsp: pointer to blank ContinuationEntry
1580 //
1581 // Kills:
1582 //   rax
1583 //
1584 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1585   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1586   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1587   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1588 
1589   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1590   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1591 
1592   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1593   OopMap* map = new OopMap(frame_size, 0);
1594 
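       // Link the freshly allocated entry into the thread's chain:
       //   entry->parent = thread->_cont_entry; thread->_cont_entry = entry (== rsp)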
1595   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1596   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1597   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1598 
1599   return map;
1600 }
1601 
1602 //---------------------------- fill_continuation_entry ---------------------------
1603 //
1604 // Arguments:
1605 //   rsp: pointer to blank Continuation entry
1606 //   reg_cont_obj: pointer to the continuation
1607 //   reg_flags: flags
1608 //
1609 // Results:
1610 //   rsp: pointer to filled out ContinuationEntry
1611 //
1612 // Kills:
1613 //   rax
1614 //
1615 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1616   assert_different_registers(rax, reg_cont_obj, reg_flags);
1617 #ifdef ASSERT
1618   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1619 #endif
1620   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1621   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1622   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1623   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1624   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1625 
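       // Stash the caller's cont_fastpath and held-monitor count in the entry and clear the
       // thread's copies so the continuation starts from a clean state;
       // continuation_enter_cleanup restores them from the entry on the way out.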
1626   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1627   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1628   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1629   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1630 
1631   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1632   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1633 }
1634 
1635 //---------------------------- continuation_enter_cleanup ---------------------------
1636 //
1637 // Arguments:
1638 //   rsp: pointer to the ContinuationEntry
1639 //
1640 // Results:
1641 //   rsp: pointer to the spilled rbp in the entry frame
1642 //
1643 // Kills:
1644 //   rbx
1645 //
1646 static void continuation_enter_cleanup(MacroAssembler* masm) {
1647 #ifdef ASSERT
1648   Label L_good_sp;
1649   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1650   __ jcc(Assembler::equal, L_good_sp);
1651   __ stop("Incorrect rsp at continuation_enter_cleanup");
1652   __ bind(L_good_sp);
1653 #endif
1654   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1655   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1656 
1657   if (CheckJNICalls) {
1658     // Check if this is a virtual thread continuation
1659     Label L_skip_vthread_code;
1660     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1661     __ jcc(Assembler::equal, L_skip_vthread_code);
1662 
1663     // If the held monitor count is > 0 and this vthread is terminating then
1664     // it failed to release a JNI monitor. So we issue the same log message
1665     // that JavaThread::exit does.
1666     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1667     __ jcc(Assembler::equal, L_skip_vthread_code);
1668 
1669     // rax may hold an exception oop, save it before the call
1670     __ push(rax);
1671     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1672     __ pop(rax);
1673 
1674     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1675     // on termination. The held count is implicitly zeroed below when we restore from
1676     // the parent held count (which has to be zero).
1677     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1678 
1679     __ bind(L_skip_vthread_code);
1680   }
1681 #ifdef ASSERT
1682   else {
1683     // Check if this is a virtual thread continuation
1684     Label L_skip_vthread_code;
1685     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1686     __ jcc(Assembler::equal, L_skip_vthread_code);
1687 
1688     // See comment just above. If not checking JNI calls the JNI count is only
1689     // needed for assertion checking.
1690     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1691 
1692     __ bind(L_skip_vthread_code);
1693   }
1694 #endif
1695 
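       // Restore the caller's state saved by fill_continuation_entry and unlink/pop the entry.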
1696   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1697   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1698 
1699   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1700   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1701   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1702 }
1703 
1704 static void gen_continuation_enter(MacroAssembler* masm,
1705                                    const VMRegPair* regs,
1706                                    int& exception_offset,
1707                                    OopMapSet* oop_maps,
1708                                    int& frame_complete,
1709                                    int& stack_slots,
1710                                    int& interpreted_entry_offset,
1711                                    int& compiled_entry_offset) {
1712 
1713   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1714   int pos_cont_obj   = 0;
1715   int pos_is_cont    = 1;
1716   int pos_is_virtual = 2;
1717 
1718   // The platform-specific calling convention may present the arguments in various registers.
1719   // To simplify the rest of the code, we expect the arguments to reside at these known
1720   // registers, and we additionally check the placement here in case calling convention ever
1721   // changes.
1722   Register reg_cont_obj   = c_rarg1;
1723   Register reg_is_cont    = c_rarg2;
1724   Register reg_is_virtual = c_rarg3;
1725 
1726   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1727   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1728   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1729 
1730   // Utility methods kill rax, make sure there are no collisions
1731   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1732 
1733   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1734                          relocInfo::static_call_type);
1735 
1736   address start = __ pc();
1737 
1738   Label L_thaw, L_exit;
1739 
1740   // i2i entry used at interp_only_mode only
1741   interpreted_entry_offset = __ pc() - start;
1742   {
1743 #ifdef ASSERT
1744     Label is_interp_only;
1745     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1746     __ jcc(Assembler::notEqual, is_interp_only);
1747     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1748     __ bind(is_interp_only);
1749 #endif
1750 
1751     __ pop(rax); // return address
1752     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
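         // The interpreter passes its arguments on the expression stack, with the outermost
         // argument farthest from SP: the Continuation oop is at stackElementSize*2,
         // isContinue at *1 and isVirtualThread at *0.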
1753     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1754     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1755     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1756     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1757     __ push(rax); // return address
1758     __ push_cont_fastpath();
1759 
1760     __ enter();
1761 
1762     stack_slots = 2; // will be adjusted in setup
1763     OopMap* map = continuation_enter_setup(masm, stack_slots);
1764     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1765     // That's okay: at the very worst we'll miss an async sample, but we're in interp_only_mode anyway.
1766 
1767     __ verify_oop(reg_cont_obj);
1768 
1769     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1770 
1771     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1772     __ testptr(reg_is_cont, reg_is_cont);
1773     __ jcc(Assembler::notZero, L_thaw);
1774 
1775     // --- Resolve path
1776 
1777     // Make sure the call is patchable
1778     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1779     // Emit stub for static call
1780     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1781     if (stub == nullptr) {
1782       fatal("CodeCache is full at gen_continuation_enter");
1783     }
1784     __ call(resolve);
1785     oop_maps->add_gc_map(__ pc() - start, map);
1786     __ post_call_nop();
1787 
1788     __ jmp(L_exit);
1789   }
1790 
1791   // compiled entry
1792   __ align(CodeEntryAlignment);
1793   compiled_entry_offset = __ pc() - start;
1794   __ enter();
1795 
1796   stack_slots = 2; // will be adjusted in setup
1797   OopMap* map = continuation_enter_setup(masm, stack_slots);
1798 
1799   // Frame is now completed as far as size and linkage.
1800   frame_complete = __ pc() - start;
1801 
1802   __ verify_oop(reg_cont_obj);
1803 
1804   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1805 
1806   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1807   __ testptr(reg_is_cont, reg_is_cont);
1808   __ jccb(Assembler::notZero, L_thaw);
1809 
1810   // --- call Continuation.enter(Continuation c, boolean isContinue)
1811 
1812   // Make sure the call is patchable
1813   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1814 
1815   // Emit stub for static call
1816   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1817   if (stub == nullptr) {
1818     fatal("CodeCache is full at gen_continuation_enter");
1819   }
1820 
1821   // The call needs to be resolved. There's a special case for this in
1822   // SharedRuntime::find_callee_info_helper() which calls
1823   // LinkResolver::resolve_continuation_enter() which resolves the call to
1824   // Continuation.enter(Continuation c, boolean isContinue).
1825   __ call(resolve);
1826 
1827   oop_maps->add_gc_map(__ pc() - start, map);
1828   __ post_call_nop();
1829 
1830   __ jmpb(L_exit);
1831 
1832   // --- Thawing path
1833 
1834   __ bind(L_thaw);
1835 
1836   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1837   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1838 
1839   ContinuationEntry::_return_pc_offset = __ pc() - start;
1840   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1841   __ post_call_nop();
1842 
1843   // --- Normal exit (resolve/thawing)
1844 
1845   __ bind(L_exit);
1846   ContinuationEntry::_cleanup_offset = __ pc() - start;
1847   continuation_enter_cleanup(masm);
1848   __ pop(rbp);
1849   __ ret(0);
1850 
1851   // --- Exception handling path
1852 
1853   exception_offset = __ pc() - start;
1854 
1855   continuation_enter_cleanup(masm);
1856   __ pop(rbp);
1857 
1858   __ movptr(c_rarg0, r15_thread);
1859   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1860 
1861   // rax still holds the original exception oop, save it before the call
1862   __ push(rax);
1863 
1864   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1865   __ movptr(rbx, rax);
1866 
1867   // Continue at exception handler:
1868   //   rax: exception oop
1869   //   rbx: exception handler
1870   //   rdx: exception pc
1871   __ pop(rax);
1872   __ verify_oop(rax);
1873   __ pop(rdx);
1874   __ jmp(rbx);
1875 }
1876 
1877 static void gen_continuation_yield(MacroAssembler* masm,
1878                                    const VMRegPair* regs,
1879                                    OopMapSet* oop_maps,
1880                                    int& frame_complete,
1881                                    int& stack_slots,
1882                                    int& compiled_entry_offset) {
1883   enum layout {
1884     rbp_off,
1885     rbpH_off,
1886     return_off,
1887     return_off2,
1888     framesize // inclusive of return address
1889   };
1890   stack_slots = framesize /  VMRegImpl::slots_per_word;
1891   assert(stack_slots == 2, "recheck layout");
1892 
1893   address start = __ pc();
1894   compiled_entry_offset = __ pc() - start;
1895   __ enter();
1896   address the_pc = __ pc();
1897 
1898   frame_complete = the_pc - start;
1899 
1900   // This nop must be exactly at the PC we push into the frame info.
1901   // We use this nop for fast CodeBlob lookup, so we associate the OopMap
1902   // with it right away.
1903   __ post_call_nop();
1904   OopMap* map = new OopMap(framesize, 1);
1905   oop_maps->add_gc_map(frame_complete, map);
1906 
1907   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1908   __ movptr(c_rarg0, r15_thread);
1909   __ movptr(c_rarg1, rsp);
1910   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1911   __ reset_last_Java_frame(true);
1912 
1913   Label L_pinned;
1914 
1915   __ testptr(rax, rax);
1916   __ jcc(Assembler::notZero, L_pinned);
1917 
1918   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1919   continuation_enter_cleanup(masm);
1920   __ pop(rbp);
1921   __ ret(0);
1922 
1923   __ bind(L_pinned);
1924 
1925   // Pinned, return to caller
1926 
1927   // handle pending exception thrown by freeze
1928   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1929   Label ok;
1930   __ jcc(Assembler::equal, ok);
1931   __ leave();
1932   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1933   __ bind(ok);
1934 
1935   __ leave();
1936   __ ret(0);
1937 }
1938 
1939 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1940   ::continuation_enter_cleanup(masm);
1941 }
1942 
1943 static void gen_special_dispatch(MacroAssembler* masm,
1944                                  const methodHandle& method,
1945                                  const BasicType* sig_bt,
1946                                  const VMRegPair* regs) {
1947   verify_oop_args(masm, method, sig_bt, regs);
1948   vmIntrinsics::ID iid = method->intrinsic_id();
1949 
1950   // Now write the args into the outgoing interpreter space
1951   bool     has_receiver   = false;
1952   Register receiver_reg   = noreg;
1953   int      member_arg_pos = -1;
1954   Register member_reg     = noreg;
1955   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1956   if (ref_kind != 0) {
1957     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1958     member_reg = rbx;  // known to be free at this point
1959     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1960   } else if (iid == vmIntrinsics::_invokeBasic) {
1961     has_receiver = true;
1962   } else if (iid == vmIntrinsics::_linkToNative) {
1963     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1964     member_reg = rbx;  // known to be free at this point
1965   } else {
1966     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1967   }
1968 
1969   if (member_reg != noreg) {
1970     // Load the member_arg into register, if necessary.
1971     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1972     VMReg r = regs[member_arg_pos].first();
1973     if (r->is_stack()) {
1974       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1975     } else {
1976       // no data motion is needed
1977       member_reg = r->as_Register();
1978     }
1979   }
1980 
1981   if (has_receiver) {
1982     // Make sure the receiver is loaded into a register.
1983     assert(method->size_of_parameters() > 0, "oob");
1984     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1985     VMReg r = regs[0].first();
1986     assert(r->is_valid(), "bad receiver arg");
1987     if (r->is_stack()) {
1988       // Porting note:  This assumes that compiled calling conventions always
1989       // pass the receiver oop in a register.  If this is not true on some
1990       // platform, pick a temp and load the receiver from stack.
1991       fatal("receiver always in a register");
1992       receiver_reg = j_rarg0;  // known to be free at this point
1993       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1994     } else {
1995       // no data motion is needed
1996       receiver_reg = r->as_Register();
1997     }
1998   }
1999 
2000   // Figure out which address we are really jumping to:
2001   MethodHandles::generate_method_handle_dispatch(masm, iid,
2002                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
2003 }
2004 
2005 // ---------------------------------------------------------------------------
2006 // Generate a native wrapper for a given method.  The method takes arguments
2007 // in the Java compiled code convention, marshals them to the native
2008 // convention (handlizes oops, etc), transitions to native, makes the call,
2009 // returns to java state (possibly blocking), unhandlizes any result and
2010 // returns.
2011 //
2012 // Critical native functions are a shorthand for the use of
2013 // GetPrimitiveArrayCritical and disallow the use of any other JNI
2014 // functions.  The wrapper is expected to unpack the arguments before
2015 // passing them to the callee. Critical native functions leave the state _in_Java,
2016 // since they cannot stop for GC.
2017 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
2018 // block and the check for pending exceptions, since it is impossible for them
2019 // to be thrown.
2020 //
2021 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
2022                                                 const methodHandle& method,
2023                                                 int compile_id,
2024                                                 BasicType* in_sig_bt,
2025                                                 VMRegPair* in_regs,
2026                                                 BasicType ret_type) {
2027   if (method->is_continuation_native_intrinsic()) {
2028     int exception_offset = -1;
2029     OopMapSet* oop_maps = new OopMapSet();
2030     int frame_complete = -1;
2031     int stack_slots = -1;
2032     int interpreted_entry_offset = -1;
2033     int vep_offset = -1;
2034     if (method->is_continuation_enter_intrinsic()) {
2035       gen_continuation_enter(masm,
2036                              in_regs,
2037                              exception_offset,
2038                              oop_maps,
2039                              frame_complete,
2040                              stack_slots,
2041                              interpreted_entry_offset,
2042                              vep_offset);
2043     } else if (method->is_continuation_yield_intrinsic()) {
2044       gen_continuation_yield(masm,
2045                              in_regs,
2046                              oop_maps,
2047                              frame_complete,
2048                              stack_slots,
2049                              vep_offset);
2050     } else {
2051       guarantee(false, "Unknown Continuation native intrinsic");
2052     }
2053 
2054 #ifdef ASSERT
2055     if (method->is_continuation_enter_intrinsic()) {
2056       assert(interpreted_entry_offset != -1, "Must be set");
2057       assert(exception_offset != -1,         "Must be set");
2058     } else {
2059       assert(interpreted_entry_offset == -1, "Must be unset");
2060       assert(exception_offset == -1,         "Must be unset");
2061     }
2062     assert(frame_complete != -1,    "Must be set");
2063     assert(stack_slots != -1,       "Must be set");
2064     assert(vep_offset != -1,        "Must be set");
2065 #endif
2066 
2067     __ flush();
2068     nmethod* nm = nmethod::new_native_nmethod(method,
2069                                               compile_id,
2070                                               masm->code(),
2071                                               vep_offset,
2072                                               frame_complete,
2073                                               stack_slots,
2074                                               in_ByteSize(-1),
2075                                               in_ByteSize(-1),
2076                                               oop_maps,
2077                                               exception_offset);
2078     if (nm == nullptr) return nm;
2079     if (method->is_continuation_enter_intrinsic()) {
2080       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2081     } else if (method->is_continuation_yield_intrinsic()) {
2082       _cont_doYield_stub = nm;
2083     }
2084     return nm;
2085   }
2086 
2087   if (method->is_method_handle_intrinsic()) {
2088     vmIntrinsics::ID iid = method->intrinsic_id();
2089     intptr_t start = (intptr_t)__ pc();
2090     int vep_offset = ((intptr_t)__ pc()) - start;
2091     gen_special_dispatch(masm,
2092                          method,
2093                          in_sig_bt,
2094                          in_regs);
2095     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2096     __ flush();
2097     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2098     return nmethod::new_native_nmethod(method,
2099                                        compile_id,
2100                                        masm->code(),
2101                                        vep_offset,
2102                                        frame_complete,
2103                                        stack_slots / VMRegImpl::slots_per_word,
2104                                        in_ByteSize(-1),
2105                                        in_ByteSize(-1),
2106                                        nullptr);
2107   }
2108   address native_func = method->native_function();
2109   assert(native_func != nullptr, "must have function");
2110 
2111   // An OopMap for lock (and class if static)
2112   OopMapSet *oop_maps = new OopMapSet();
2113   intptr_t start = (intptr_t)__ pc();
2114 
2115   // We have received a description of where all the java args are located
2116   // on entry to the wrapper. We need to convert these args to where
2117   // the jni function will expect them. To figure out where they go
2118   // we convert the java signature to a C signature by inserting
2119   // the hidden arguments as arg[0] and possibly arg[1] (static method)
2120 
2121   const int total_in_args = method->size_of_parameters();
2122   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2123 
2124   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2125   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2126 
2127   int argc = 0;
2128   out_sig_bt[argc++] = T_ADDRESS;
2129   if (method->is_static()) {
2130     out_sig_bt[argc++] = T_OBJECT;
2131   }
2132 
2133   for (int i = 0; i < total_in_args ; i++ ) {
2134     out_sig_bt[argc++] = in_sig_bt[i];
2135   }
2136 
2137   // Now figure out where the args must be stored and how much stack space
2138   // they require.
2139   int out_arg_slots;
2140   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2141 
2142   // Compute framesize for the wrapper.  We need to handlize all oops in
2143   // incoming registers
2144 
2145   // Calculate the total number of stack slots we will need.
2146 
2147   // First count the abi requirement plus all of the outgoing args
2148   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2149 
2150   // Now the space for the inbound oop handle area
2151   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
2152 
2153   int oop_handle_offset = stack_slots;
2154   stack_slots += total_save_slots;
2155 
2156   // Now any space we need for handlizing a klass if static method
2157 
2158   int klass_slot_offset = 0;
2159   int klass_offset = -1;
2160   int lock_slot_offset = 0;
2161   bool is_static = false;
2162 
2163   if (method->is_static()) {
2164     klass_slot_offset = stack_slots;
2165     stack_slots += VMRegImpl::slots_per_word;
2166     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2167     is_static = true;
2168   }
2169 
2170   // Plus a lock if needed
2171 
2172   if (method->is_synchronized()) {
2173     lock_slot_offset = stack_slots;
2174     stack_slots += VMRegImpl::slots_per_word;
2175   }
2176 
2177   // Now a place (+2 slots) to save return values or temps during shuffling,
2178   // + 4 slots for the return address (which we own) and the saved rbp
2179   stack_slots += 6;
2180 
2181   // Ok The space we have allocated will look like:
2182   //
2183   //
2184   // FP-> |                     |
2185   //      |---------------------|
2186   //      | 2 slots for moves   |
2187   //      |---------------------|
2188   //      | lock box (if sync)  |
2189   //      |---------------------| <- lock_slot_offset
2190   //      | klass (if static)   |
2191   //      |---------------------| <- klass_slot_offset
2192   //      | oopHandle area      |
2193   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2194   //      | outbound memory     |
2195   //      | based arguments     |
2196   //      |                     |
2197   //      |---------------------|
2198   //      |                     |
2199   // SP-> | out_preserved_slots |
2200   //
2201   //
2202 
2203 
2204   // Now compute the actual number of stack words we need, rounding up to keep the
2205   // stack properly aligned.
2206   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2207 
2208   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2209 
2210   // First thing make an ic check to see if we should even be here
2211 
2212   // We are free to use all registers as temps without saving them and
2213   // restoring them except rbp. rbp is the only callee save register
2214   // as far as the interpreter and the compiler(s) are concerned.
2215 
2216   const Register receiver = j_rarg0;
2217 
2218   Label exception_pending;
2219 
2220   assert_different_registers(receiver, rscratch1, rscratch2);
2221   __ verify_oop(receiver);
2222   __ ic_check(8 /* end_alignment */);
2223 
2224   int vep_offset = ((intptr_t)__ pc()) - start;
2225 
2226   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2227     Label L_skip_barrier;
2228     Register klass = r10;
2229     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2230     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
2231 
2232     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2233 
2234     __ bind(L_skip_barrier);
2235   }
2236 
2237 #ifdef COMPILER1
2238   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2239   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2240     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2241   }
2242 #endif // COMPILER1
2243 
2244   // The instruction at the verified entry point must be 5 bytes or longer
2245   // because it can be patched on the fly by make_non_entrant. The stack bang
2246   // instruction fits that requirement.
2247 
2248   // Generate stack overflow check
2249   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2250 
2251   // Generate a new frame for the wrapper.
2252   __ enter();
2253   // -2 because return address is already present and so is saved rbp
2254   __ subptr(rsp, stack_size - 2*wordSize);
2255 
2256   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2257   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2258   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2259 
2260   // Frame is now completed as far as size and linkage.
2261   int frame_complete = ((intptr_t)__ pc()) - start;
2262 
2263 #ifdef ASSERT
2264   __ check_stack_alignment(rsp, "improperly aligned stack");
2265 #endif /* ASSERT */
2266 
2267 
2268   // We use r14 as the oop handle for the receiver/klass
2269   // It is callee save so it survives the call to native
2270 
2271   const Register oop_handle_reg = r14;
2272 
2273   //
2274   // We immediately shuffle the arguments so that any vm call we have to
2275   // make from here on out (sync slow path, jvmti, etc.) we will have
2276   // captured the oops from our caller and have a valid oopMap for
2277   // them.
2278 
2279   // -----------------
2280   // The Grand Shuffle
2281 
2282   // The Java calling convention is either equal (linux) or denser (win64) than the
2283   // C calling convention. However, because of the jni_env argument, the C calling
2284   // convention always has at least one more (and two for static) arguments than Java.
2285   // Therefore if we move the args from java -> c backwards then we will never have
2286   // a register->register conflict and we don't have to build a dependency graph
2287   // and figure out how to break any cycles.
2288   //
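       // For instance (on SysV, where j_rarg0/j_rarg1 alias c_rarg1/c_rarg2): a static native
       // with two oop arguments receives them in rsi and rdx, while the C side wants
       // (JNIEnv*, jclass, oop0, oop1) in rdi/rsi/rdx/rcx. Walking backwards moves the value
       // in rdx into its slot (rcx) before the one in rsi lands in rdx, so nothing is
       // clobbered before it has been read.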
2289 
2290   // Record esp-based slot for receiver on stack for non-static methods
2291   int receiver_offset = -1;
2292 
2293   // This is a trick. We double the stack slots so we can claim
2294   // the oops in the caller's frame. Since we are sure to have
2295   // more args than the caller, doubling is enough to make
2296   // sure we can capture all the incoming oop args from the
2297   // caller.
2298   //
2299   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2300 
2301   // Mark location of rbp (someday)
2302   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2303 
2304   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2305   // All inbound args are referenced based on rbp and all outbound args via rsp.
2306 
2307 
2308 #ifdef ASSERT
2309   bool reg_destroyed[Register::number_of_registers];
2310   bool freg_destroyed[XMMRegister::number_of_registers];
2311   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2312     reg_destroyed[r] = false;
2313   }
2314   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2315     freg_destroyed[f] = false;
2316   }
2317 
2318 #endif /* ASSERT */
2319 
2320   // For JNI natives the incoming and outgoing registers are offset upwards.
2321   GrowableArray<int> arg_order(2 * total_in_args);
2322 
2323   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2324     arg_order.push(i);
2325     arg_order.push(c_arg);
2326   }
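       // arg_order now holds (java_index, c_index) pairs, last argument first, matching the
       // backwards-walk strategy described above.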
2327 
2328   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2329     int i = arg_order.at(ai);
2330     int c_arg = arg_order.at(ai + 1);
2331     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2332 #ifdef ASSERT
2333     if (in_regs[i].first()->is_Register()) {
2334       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2335     } else if (in_regs[i].first()->is_XMMRegister()) {
2336       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2337     }
2338     if (out_regs[c_arg].first()->is_Register()) {
2339       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2340     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2341       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2342     }
2343 #endif /* ASSERT */
2344     switch (in_sig_bt[i]) {
2345       case T_ARRAY:
2346       case T_OBJECT:
2347         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2348                     ((i == 0) && (!is_static)),
2349                     &receiver_offset);
2350         break;
2351       case T_VOID:
2352         break;
2353 
2354       case T_FLOAT:
2355         __ float_move(in_regs[i], out_regs[c_arg]);
2356         break;
2357 
2358       case T_DOUBLE:
2359         assert( i + 1 < total_in_args &&
2360                 in_sig_bt[i + 1] == T_VOID &&
2361                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2362         __ double_move(in_regs[i], out_regs[c_arg]);
2363         break;
2364 
2365       case T_LONG :
2366         __ long_move(in_regs[i], out_regs[c_arg]);
2367         break;
2368 
2369       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2370 
2371       default:
2372         __ move32_64(in_regs[i], out_regs[c_arg]);
2373     }
2374   }
2375 
2376   int c_arg;
2377 
2378   // Pre-load a static method's oop into r14.  Used both by locking code and
2379   // the normal JNI call code.
2380   // point c_arg at the first arg that is already loaded in case we
2381   // need to spill before we call out
2382   c_arg = total_c_args - total_in_args;
2383 
2384   if (method->is_static()) {
2385 
2386     //  load oop into a register
2387     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2388 
2389     // Now handlize the static class mirror; it's known to be not-null.
2390     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2391     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2392 
2393     // Now get the handle
2394     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2395     // store the klass handle as second argument
2396     __ movptr(c_rarg1, oop_handle_reg);
2397     // and protect the arg if we must spill
2398     c_arg--;
2399   }
2400 
2401   // Change state to native (we save the return address in the thread, since it might not
2402   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2403   // points into the right code segment. It does not have to be the correct return pc.
2404   // We use the same pc/oopMap repeatedly when we call out
2405 
2406   Label native_return;
2407   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2408     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2409     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2410   } else {
2411     intptr_t the_pc = (intptr_t) __ pc();
2412     oop_maps->add_gc_map(the_pc - start, map);
2413 
2414     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2415   }
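       // In the Object.wait0 case above, the GC map is registered later at native_return
       // (the resume pc); otherwise it was recorded right here.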
2416 
2417   // We have all of the arguments set up at this point. We must not touch any register
2418   // argument registers from here on (if we were to save/restore them, there would be no oop map covering them).
2419 
2420   if (DTraceMethodProbes) {
2421     // protect the args we've loaded
2422     save_args(masm, total_c_args, c_arg, out_regs);
2423     __ mov_metadata(c_rarg1, method());
2424     __ call_VM_leaf(
2425       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2426       r15_thread, c_rarg1);
2427     restore_args(masm, total_c_args, c_arg, out_regs);
2428   }
2429 
2430   // RedefineClasses() tracing support for obsolete method entry
2431   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2432     // protect the args we've loaded
2433     save_args(masm, total_c_args, c_arg, out_regs);
2434     __ mov_metadata(c_rarg1, method());
2435     __ call_VM_leaf(
2436       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2437       r15_thread, c_rarg1);
2438     restore_args(masm, total_c_args, c_arg, out_regs);
2439   }
2440 
2441   // Lock a synchronized method
2442 
2443   // Register definitions used by locking and unlocking
2444 
2445   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2446   const Register obj_reg  = rbx;  // Will contain the oop
2447   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2448   const Register old_hdr  = r13;  // value of old header at unlock time
2449 
2450   Label slow_path_lock;
2451   Label lock_done;
2452 
2453   if (method->is_synchronized()) {
2454     Label count_mon;
2455 
2456     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2457 
2458     // Get the handle (the 2nd argument)
2459     __ mov(oop_handle_reg, c_rarg1);
2460 
2461     // Get address of the box
2462 
2463     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2464 
2465     // Load the oop from the handle
2466     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2467 
2468     if (LockingMode == LM_MONITOR) {
2469       __ jmp(slow_path_lock);
2470     } else if (LockingMode == LM_LEGACY) {
2471       // Load immediate 1 into swap_reg %rax
2472       __ movl(swap_reg, 1);
2473 
2474       // Load (object->mark() | 1) into swap_reg %rax
2475       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2476       if (EnableValhalla) {
2477         // Mask inline_type bit such that we go to the slow path if object is an inline type
2478         __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place));
2479       }
2480 
2481       // Save (object->mark() | 1) into BasicLock's displaced header
2482       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2483 
2484       // src -> dest iff dest == rax else rax <- dest
2485       __ lock();
2486       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2487       __ jcc(Assembler::equal, count_mon);
2488 
2489       // Hmm should this move to the slow path code area???
2490 
2491       // Test if the oopMark is an obvious stack pointer, i.e.,
2492       //  1) (mark & 3) == 0, and
2493       //  2) rsp <= mark < mark + os::pagesize()
2494       // These 3 tests can be done by evaluating the following
2495       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2496       // assuming both stack pointer and pagesize have their
2497       // least significant 2 bits clear.
2498       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
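           // For example, with a 4K page the mask is 3 - 4096 == 0x...f003, so the result is
           // zero only if mark >= rsp, mark - rsp < 4096, and the low two bits are clear.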
2499 
2500       __ subptr(swap_reg, rsp);
2501       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2502 
2503       // Save the test result, for recursive case, the result is zero
2504       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2505       __ jcc(Assembler::notEqual, slow_path_lock);
2506 
2507       __ bind(count_mon);
2508       __ inc_held_monitor_count();
2509     } else {
2510       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2511       __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2512     }
2513 
2514     // Slow path will re-enter here
2515     __ bind(lock_done);
2516   }
2517 
2518   // Finally just about ready to make the JNI call
2519 
2520   // get JNIEnv* which is first argument to native
2521   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2522 
2523   // Now set thread in native
2524   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2525 
2526   __ call(RuntimeAddress(native_func));
2527 
2528   // Verify or restore cpu control state after JNI call
2529   __ restore_cpu_control_state_after_jni(rscratch1);
2530 
2531   // Unpack native results.
2532   switch (ret_type) {
2533   case T_BOOLEAN: __ c2bool(rax);            break;
2534   case T_CHAR   : __ movzwl(rax, rax);      break;
2535   case T_BYTE   : __ sign_extend_byte (rax); break;
2536   case T_SHORT  : __ sign_extend_short(rax); break;
2537   case T_INT    : /* nothing to do */        break;
2538   case T_DOUBLE :
2539   case T_FLOAT  :
2540     // Result is in xmm0 we'll save as needed
2541     break;
2542   case T_ARRAY:                 // Really a handle
2543   case T_OBJECT:                // Really a handle
2544       break; // can't de-handlize until after safepoint check
2545   case T_VOID: break;
2546   case T_LONG: break;
2547   default       : ShouldNotReachHere();
2548   }
2549 
2550   // Switch thread to "native transition" state before reading the synchronization state.
2551   // This additional state is necessary because reading and testing the synchronization
2552   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2553   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2554   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2555   //     Thread A is resumed to finish this native method, but doesn't block here since it
2556   //     didn't see any synchronization in progress, and escapes.
2557   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2558 
2559   // Force this write out before the read below
2560   if (!UseSystemMemoryBarrier) {
2561     __ membar(Assembler::Membar_mask_bits(
2562               Assembler::LoadLoad | Assembler::LoadStore |
2563               Assembler::StoreLoad | Assembler::StoreStore));
2564   }
2565 
2566   // check for safepoint operation in progress and/or pending suspend requests
2567   {
2568     Label Continue;
2569     Label slow_path;
2570 
2571     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2572 
2573     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2574     __ jcc(Assembler::equal, Continue);
2575     __ bind(slow_path);
2576 
2577     // Don't use call_VM as it will see a possible pending exception and forward it
2578     // and never return here preventing us from clearing _last_native_pc down below.
2579     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2580     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2581     // by hand.
2582     //
2583     __ vzeroupper();
2584     save_native_result(masm, ret_type, stack_slots);
2585     __ mov(c_rarg0, r15_thread);
2586     __ mov(r12, rsp); // remember sp
2587     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2588     __ andptr(rsp, -16); // align stack as required by ABI
2589     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2590     __ mov(rsp, r12); // restore sp
2591     __ reinit_heapbase();
2592     // Restore any method result value
2593     restore_native_result(masm, ret_type, stack_slots);
2594     __ bind(Continue);
2595   }
2596 
2597   // change thread state
2598   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2599 
2600   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2601     // Check preemption for Object.wait()
2602     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2603     __ cmpptr(rscratch1, NULL_WORD);
2604     __ jccb(Assembler::equal, native_return);
2605     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2606     __ jmp(rscratch1);
2607     __ bind(native_return);
2608 
2609     intptr_t the_pc = (intptr_t) __ pc();
2610     oop_maps->add_gc_map(the_pc - start, map);
2611   }
2612 
2613 
2614   Label reguard;
2615   Label reguard_done;
2616   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2617   __ jcc(Assembler::equal, reguard);
2618   __ bind(reguard_done);
2619 
2620   // The native result, if any, is live here.
2621 
2622   // Unlock
2623   Label slow_path_unlock;
2624   Label unlock_done;
2625   if (method->is_synchronized()) {
2626 
2627     Label fast_done;
2628 
2629     // Get locked oop from the handle we passed to jni
2630     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2631 
2632     if (LockingMode == LM_LEGACY) {
2633       Label not_recur;
2634       // Simple recursive lock?
2635       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2636       __ jcc(Assembler::notEqual, not_recur);
2637       __ dec_held_monitor_count();
2638       __ jmpb(fast_done);
2639       __ bind(not_recur);
2640     }
2641 
2642     // Must save rax if it is live now because cmpxchg must use it
2643     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2644       save_native_result(masm, ret_type, stack_slots);
2645     }
2646 
2647     if (LockingMode == LM_MONITOR) {
2648       __ jmp(slow_path_unlock);
2649     } else if (LockingMode == LM_LEGACY) {
2650       // get address of the stack lock
2651       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2652       //  get old displaced header
2653       __ movptr(old_hdr, Address(rax, 0));
2654 
2655       // Atomic swap old header if oop still contains the stack lock
2656       __ lock();
2657       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2658       __ jcc(Assembler::notEqual, slow_path_unlock);
2659       __ dec_held_monitor_count();
2660     } else {
2661       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2662       __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2663     }
2664 
2665     // slow path re-enters here
2666     __ bind(unlock_done);
2667     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2668       restore_native_result(masm, ret_type, stack_slots);
2669     }
2670 
2671     __ bind(fast_done);
2672   }
2673   if (DTraceMethodProbes) {
2674     save_native_result(masm, ret_type, stack_slots);
2675     __ mov_metadata(c_rarg1, method());
2676     __ call_VM_leaf(
2677          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2678          r15_thread, c_rarg1);
2679     restore_native_result(masm, ret_type, stack_slots);
2680   }
2681 
2682   __ reset_last_Java_frame(false);
2683 
2684   // Unbox oop result, e.g. JNIHandles::resolve value.
2685   if (is_reference_type(ret_type)) {
2686     __ resolve_jobject(rax /* value */,
2687                        rcx /* tmp */);
2688   }
2689 
2690   if (CheckJNICalls) {
2691     // clear_pending_jni_exception_check
2692     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2693   }
2694 
2695   // reset handle block
2696   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2697   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2698 
2699   // pop our frame
2700 
2701   __ leave();
2702 
2703 #if INCLUDE_JFR
2704   // We need to do a poll test after unwind in case the sampler
2705   // managed to sample the native frame after returning to Java.
2706   Label L_return;
2707   address poll_test_pc = __ pc();
2708   __ relocate(relocInfo::poll_return_type);
2709   __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2710   __ jccb(Assembler::zero, L_return);
2711   __ lea(rscratch1, InternalAddress(poll_test_pc));
2712   __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2713   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2714     "polling page return stub not created yet");
2715   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2716   __ jump(RuntimeAddress(stub));
2717   __ bind(L_return);
2718 #endif // INCLUDE_JFR
2719 
2720   // Any exception pending?
2721   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2722   __ jcc(Assembler::notEqual, exception_pending);
2723 
2724   // Return
2725 
2726   __ ret(0);
2727 
2728   // Unexpected paths are out of line and go here
2729 
2730   // forward the exception
2731   __ bind(exception_pending);
2732 
2733   // and forward the exception
2734   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2735 
2736   // Slow path locking & unlocking
2737   if (method->is_synchronized()) {
2738 
2739     // BEGIN Slow path lock
2740     __ bind(slow_path_lock);
2741 
2742     // We have last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM.
2743     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2744 
2745     // protect the args we've loaded
2746     save_args(masm, total_c_args, c_arg, out_regs);
2747 
2748     __ mov(c_rarg0, obj_reg);
2749     __ mov(c_rarg1, lock_reg);
2750     __ mov(c_rarg2, r15_thread);
2751 
2752     // Not a leaf but we have last_Java_frame setup as we want.
2753     // We don't want to unmount in case of contention since that would complicate preserving
2754     // the arguments that had already been marshalled into the native convention. So we force
2755     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2756     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2757     __ push_cont_fastpath();
2758     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2759     __ pop_cont_fastpath();
2760     restore_args(masm, total_c_args, c_arg, out_regs);
2761 
2762 #ifdef ASSERT
2763     { Label L;
2764     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2765     __ jcc(Assembler::equal, L);
2766     __ stop("no pending exception allowed on exit from monitorenter");
2767     __ bind(L);
2768     }
2769 #endif
2770     __ jmp(lock_done);
2771 
2772     // END Slow path lock
2773 
2774     // BEGIN Slow path unlock
2775     __ bind(slow_path_unlock);
2776 
2777     // If we haven't already saved the native result we must save it now as xmm registers
2778     // are still exposed.
2779     __ vzeroupper();
2780     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2781       save_native_result(masm, ret_type, stack_slots);
2782     }
2783 
2784     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2785 
2786     __ mov(c_rarg0, obj_reg);
2787     __ mov(c_rarg2, r15_thread);
2788     __ mov(r12, rsp); // remember sp
2789     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2790     __ andptr(rsp, -16); // align stack as required by ABI
2791 
2792     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2793     // NOTE that obj_reg == rbx currently
2794     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2795     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2796 
2797     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2798     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2799     __ mov(rsp, r12); // restore sp
2800     __ reinit_heapbase();
2801 #ifdef ASSERT
2802     {
2803       Label L;
2804       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2805       __ jcc(Assembler::equal, L);
2806       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2807       __ bind(L);
2808     }
2809 #endif /* ASSERT */
2810 
2811     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2812 
2813     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2814       restore_native_result(masm, ret_type, stack_slots);
2815     }
2816     __ jmp(unlock_done);
2817 
2818     // END Slow path unlock
2819 
2820   } // synchronized
2821 
2822   // SLOW PATH Reguard the stack if needed
2823 
2824   __ bind(reguard);
2825   __ vzeroupper();
2826   save_native_result(masm, ret_type, stack_slots);
2827   __ mov(r12, rsp); // remember sp
2828   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2829   __ andptr(rsp, -16); // align stack as required by ABI
2830   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2831   __ mov(rsp, r12); // restore sp
2832   __ reinit_heapbase();
2833   restore_native_result(masm, ret_type, stack_slots);
2834   // and continue
2835   __ jmp(reguard_done);
2836 
2837 
2838 
2839   __ flush();
2840 
2841   nmethod *nm = nmethod::new_native_nmethod(method,
2842                                             compile_id,
2843                                             masm->code(),
2844                                             vep_offset,
2845                                             frame_complete,
2846                                             stack_slots / VMRegImpl::slots_per_word,
2847                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2848                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2849                                             oop_maps);
2850 
2851   return nm;
2852 }
2853 
2854 // This function returns the adjustment size (in number of words) to a c2i adapter
2855 // activation, for use during deoptimization.
2856 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2857   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2858 }
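
// A worked example of the computation above (illustrative numbers only): for a
// callee with 3 parameters and 7 locals, the interpreter activation needs room
// for the 4 non-parameter locals, i.e. the adjustment is
// (7 - 3) * Interpreter::stackElementWords words.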
2859 
2860 
2861 uint SharedRuntime::out_preserve_stack_slots() {
2862   return 0;
2863 }
2864 
2865 
2866 // Number of stack slots between incoming argument block and the start of
2867 // a new frame.  The PROLOG must add this many slots to the stack.  The
2868 // EPILOG must remove this many slots.  amd64 needs two slots for the
2869 // return address.
2870 uint SharedRuntime::in_preserve_stack_slots() {
2871   return 4 + 2 * VerifyStackAtCalls;
2872 }
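
// A quick sanity check of the value above (assuming the usual 4-byte VMReg
// stack slots): 4 slots == 16 bytes == 2 machine words reserved between the
// incoming argument block and the new frame; with VerifyStackAtCalls enabled,
// two more slots (one extra word) are preserved.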
2873 
2874 VMReg SharedRuntime::thread_register() {
2875   return r15_thread->as_VMReg();
2876 }
2877 
2878 //------------------------------generate_deopt_blob----------------------------
2879 void SharedRuntime::generate_deopt_blob() {
2880   // Allocate space for the code
2881   ResourceMark rm;
2882   // Setup code generation tools
2883   int pad = 0;
2884   if (UseAVX > 2) {
2885     pad += 1024;
2886   }
2887   if (UseAPX) {
2888     pad += 1024;
2889   }
2890 #if INCLUDE_JVMCI
2891   if (EnableJVMCI) {
2892     pad += 512; // Increase the buffer size when compiling for JVMCI
2893   }
2894 #endif
2895   const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id);
2896   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)SharedStubId::deopt_id, name);
2897   if (blob != nullptr) {
2898     _deopt_blob = blob->as_deoptimization_blob();
2899     return;
2900   }
2901 
2902   CodeBuffer buffer(name, 2560+pad, 1024);
2903   MacroAssembler* masm = new MacroAssembler(&buffer);
2904   int frame_size_in_words;
2905   OopMap* map = nullptr;
2906   OopMapSet *oop_maps = new OopMapSet();
2907 
2908   // -------------
2909   // This code enters when returning to a de-optimized nmethod.  A return
2910   // address has been pushed on the stack, and return values are in
2911   // registers.
2912   // If we are doing a normal deopt then we were called from the patched
2913   // nmethod at the point where we returned into it, so the return
2914   // address on the stack is off by NativeCall::instruction_size.
2915   // We will adjust the value so it looks like we have the original return
2916   // address on the stack (like when we eagerly deoptimized).
2917   // In the case of an exception pending when deoptimizing, we enter
2918   // with a return address on the stack that points after the call we patched
2919   // into the exception handler. We have the following register state from,
2920   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2921   //    rax: exception oop
2922   //    rbx: exception handler
2923   //    rdx: throwing pc
2924   // So in this case we simply jam rdx into the useless return address and
2925   // the stack looks just like we want.
2926   //
2927   // At this point we need to de-opt.  We save the argument return
2928   // registers.  We call the first C routine, fetch_unroll_info().  This
2929   // routine captures the return values and returns a structure which
2930   // describes the current frame size and the sizes of all replacement frames.
2931   // The current frame is compiled code and may contain many inlined
2932   // functions, each with their own JVM state.  We pop the current frame, then
2933   // push all the new frames.  Then we call the C routine unpack_frames() to
2934   // populate these frames.  Finally unpack_frames() returns us the new target
2935   // address.  Notice that callee-save registers are BLOWN here; they have
2936   // already been captured in the vframeArray at the time the return PC was
2937   // patched.
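  //
  // In outline (a sketch of the steps just described, for orientation only):
  //   1. save all live registers and record an oopmap
  //   2. call fetch_unroll_info() to obtain the UnrollBlock describing the
  //      deoptimized frame and its replacement frames
  //   3. pop the deoptimized compiled frame
  //   4. push one skeletal interpreter frame per inlined JVM state
  //   5. call unpack_frames() to populate those frames and obtain the new
  //      target address
  //   6. return to that address in the interpreter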
2938   address start = __ pc();
2939   Label cont;
2940 
2941   // Prolog for the non-exception case!
2942 
2943   // Save everything in sight.
2944   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2945 
2946   // Normal deoptimization.  Save exec mode for unpack_frames.
2947   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2948   __ jmp(cont);
2949 
2950   int reexecute_offset = __ pc() - start;
2951 #if INCLUDE_JVMCI && !defined(COMPILER1)
2952   if (UseJVMCICompiler) {
2953     // JVMCI does not use this kind of deoptimization
2954     __ should_not_reach_here();
2955   }
2956 #endif
2957 
2958   // Reexecute case
2959   // the return address is the pc that describes which bci to re-execute at
2960 
2961   // No need to update map as each call to save_live_registers will produce identical oopmap
2962   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2963 
2964   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2965   __ jmp(cont);
2966 
2967 #if INCLUDE_JVMCI
2968   Label after_fetch_unroll_info_call;
2969   int implicit_exception_uncommon_trap_offset = 0;
2970   int uncommon_trap_offset = 0;
2971 
2972   if (EnableJVMCI) {
2973     implicit_exception_uncommon_trap_offset = __ pc() - start;
2974 
2975     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2976     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2977 
2978     uncommon_trap_offset = __ pc() - start;
2979 
2980     // Save everything in sight.
2981     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2982     // fetch_unroll_info needs to call last_java_frame()
2983     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2984 
2985     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2986     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2987 
2988     __ movl(r14, Deoptimization::Unpack_reexecute);
2989     __ mov(c_rarg0, r15_thread);
2990     __ movl(c_rarg2, r14); // exec mode
2991     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2992     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2993 
2994     __ reset_last_Java_frame(false);
2995 
2996     __ jmp(after_fetch_unroll_info_call);
2997   } // EnableJVMCI
2998 #endif // INCLUDE_JVMCI
2999 
3000   int exception_offset = __ pc() - start;
3001 
3002   // Prolog for exception case
3003 
3004   // All registers are dead at this entry point, except for rax and
3005   // rdx, which contain the exception oop and exception pc
3006   // respectively.  Set them in TLS and fall thru to the
3007   // unpack_with_exception_in_tls entry point.
3008 
3009   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3010   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3011 
3012   int exception_in_tls_offset = __ pc() - start;
3013 
3014   // new implementation because exception oop is now passed in JavaThread
3015 
3016   // Prolog for exception case
3017   // All registers must be preserved because they might be used by LinearScan
3018   // Exception oop and throwing PC are passed in JavaThread
3019   // tos: stack at point of call to method that threw the exception (i.e. only
3020   // args are on the stack, no return address)
3021 
3022   // make room on stack for the return address
3023   // It will be patched later with the throwing pc. The correct value is not
3024   // available now because loading it from memory would destroy registers.
3025   __ push(0);
3026 
3027   // Save everything in sight.
3028   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
3029 
3030   // Now it is safe to overwrite any register
3031 
3032   // Deopt during an exception.  Save exec mode for unpack_frames.
3033   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
3034 
3035   // load throwing pc from JavaThread and patch it as the return address
3036   // of the current frame. Then clear the field in JavaThread
3037 
3038   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3039   __ movptr(Address(rbp, wordSize), rdx);
3040   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3041 
3042 #ifdef ASSERT
3043   // verify that there is really an exception oop in JavaThread
3044   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3045   __ verify_oop(rax);
3046 
3047   // verify that there is no pending exception
3048   Label no_pending_exception;
3049   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3050   __ testptr(rax, rax);
3051   __ jcc(Assembler::zero, no_pending_exception);
3052   __ stop("must not have pending exception here");
3053   __ bind(no_pending_exception);
3054 #endif
3055 
3056   __ bind(cont);
3057 
3058   // Call C code.  Need thread and this frame, but NOT official VM entry
3059   // crud.  We cannot block on this call, no GC can happen.
3060   //
3061   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
3062 
3063   // fetch_unroll_info needs to call last_java_frame().
3064 
3065   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3066 #ifdef ASSERT
3067   { Label L;
3068     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3069     __ jcc(Assembler::equal, L);
3070     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
3071     __ bind(L);
3072   }
3073 #endif // ASSERT
3074   __ mov(c_rarg0, r15_thread);
3075   __ movl(c_rarg1, r14); // exec_mode
3076   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
3077 
3078   // Need to have an oopmap that tells fetch_unroll_info where to
3079   // find any register it might need.
3080   oop_maps->add_gc_map(__ pc() - start, map);
3081 
3082   __ reset_last_Java_frame(false);
3083 
3084 #if INCLUDE_JVMCI
3085   if (EnableJVMCI) {
3086     __ bind(after_fetch_unroll_info_call);
3087   }
3088 #endif
3089 
3090   // Load UnrollBlock* into rdi
3091   __ mov(rdi, rax);
3092 
3093   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
3094   Label noException;
3095   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
3096   __ jcc(Assembler::notEqual, noException);
3097   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3098   // QQQ this is useless; it was null above
3099   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3100   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3101   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3102 
3103   __ verify_oop(rax);
3104 
3105   // Overwrite the result registers with the exception results.
3106   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3107   // I think this is useless
3108   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
3109 
3110   __ bind(noException);
3111 
3112   // Only register save data is on the stack.
3113   // Now restore the result registers.  Everything else is either dead
3114   // or captured in the vframeArray.
3115   RegisterSaver::restore_result_registers(masm);
3116 
3117   // All of the register save area has been popped off the stack. Only the
3118   // return address remains.
3119 
3120   // Pop all the frames we must move/replace.
3121   //
3122   // Frame picture (youngest to oldest)
3123   // 1: self-frame (no frame link)
3124   // 2: deopting frame  (no frame link)
3125   // 3: caller of deopting frame (could be compiled/interpreted).
3126   //
3127   // Note: by leaving the return address of self-frame on the stack
3128   // and using the size of frame 2 to adjust the stack
3129   // when we are done the return to frame 3 will still be on the stack.
3130 
3131   // Pop deoptimized frame
3132   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
3133   __ addptr(rsp, rcx);
3134 
3135   // rsp should be pointing at the return address to the caller (3)
3136 
3137   // Pick up the initial fp we should save
3138   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3139   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3140 
3141 #ifdef ASSERT
3142   // Compilers generate code that bangs the stack by as much as the
3143   // interpreter would need. So this stack banging should never
3144   // trigger a fault. Verify that it does not on non-product builds.
3145   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3146   __ bang_stack_size(rbx, rcx);
3147 #endif
3148 
3149   // Load address of array of frame pcs into rcx
3150   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3151 
3152   // Trash the old pc
3153   __ addptr(rsp, wordSize);
3154 
3155   // Load address of array of frame sizes into rsi
3156   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3157 
3158   // Load counter into rdx
3159   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
3160 
3161   // Now adjust the caller's stack to make up for the extra locals, but
3162   // record the original sp so that we can save it in the skeletal interpreter
3163   // frame; the stack walking of interpreter_sender will then get the unextended
3164   // sp value and not the "real" sp value.
3165 
3166   const Register sender_sp = r8;
3167 
3168   __ mov(sender_sp, rsp);
3169   __ movl(rbx, Address(rdi,
3170                        Deoptimization::UnrollBlock::
3171                        caller_adjustment_offset()));
3172   __ subptr(rsp, rbx);
3173 
3174   // Push interpreter frames in a loop
3175   Label loop;
3176   __ bind(loop);
3177   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3178   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3179   __ pushptr(Address(rcx, 0));          // Save return address
3180   __ enter();                           // Save old & set new ebp
3181   __ subptr(rsp, rbx);                  // Prolog
3182   // This value is corrected by layout_activation_impl
3183   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3184   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3185   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3186   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3187   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3188   __ decrementl(rdx);                   // Decrement counter
3189   __ jcc(Assembler::notZero, loop);
3190   __ pushptr(Address(rcx, 0));          // Save final return address
3191 
3192   // Re-push self-frame
3193   __ enter();                           // Save old & set new ebp
3194 
3195   // Allocate a full sized register save area.
3196   // Return address and rbp are in place, so we allocate two less words.
3197   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3198 
3199   // Restore frame locals after moving the frame
3200   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3201   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3202 
3203   // Call C code.  Need thread but NOT official VM entry
3204   // crud.  We cannot block on this call, no GC can happen.  Call should
3205   // restore return values to their stack-slots with the new SP.
3206   //
3207   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3208 
3209   // Use rbp because the frames look interpreted now
3210   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3211   // Don't need the precise return PC here, just precise enough to point into this code blob.
3212   address the_pc = __ pc();
3213   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3214 
3215   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3216   __ mov(c_rarg0, r15_thread);
3217   __ movl(c_rarg1, r14); // second arg: exec_mode
3218   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3219   // Revert SP alignment after call since we're going to do some SP relative addressing below
3220   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3221 
3222   // Set an oopmap for the call site
3223   // Use the same PC we used for the last java frame
3224   oop_maps->add_gc_map(the_pc - start,
3225                        new OopMap( frame_size_in_words, 0 ));
3226 
3227   // Clear fp AND pc
3228   __ reset_last_Java_frame(true);
3229 
3230   // Collect return values
3231   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3232   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3233   // I think this is useless (throwing pc?)
3234   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3235 
3236   // Pop self-frame.
3237   __ leave();                           // Epilog
3238 
3239   // Jump to interpreter
3240   __ ret(0);
3241 
3242   // Make sure all code is generated
3243   masm->flush();
3244 
3245   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3246   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3247 #if INCLUDE_JVMCI
3248   if (EnableJVMCI) {
3249     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3250     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3251   }
3252 #endif
3253 
3254   AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, (uint)SharedStubId::deopt_id, name);
3255 }
3256 
3257 //------------------------------generate_handler_blob------
3258 //
3259 // Generate a special Compile2Runtime blob that saves all registers
3260 // and sets up an oopmap.
3261 //
3262 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) {
3263   assert(StubRoutines::forward_exception_entry() != nullptr,
3264          "must be generated before");
3265   assert(is_polling_page_id(id), "expected a polling page stub id");
3266 
3267   // Allocate space for the code.  Setup code generation tools.
3268   const char* name = SharedRuntime::stub_name(id);
3269   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)id, name);
3270   if (blob != nullptr) {
3271     return blob->as_safepoint_blob();
3272   }
3273 
3274   ResourceMark rm;
3275   OopMapSet *oop_maps = new OopMapSet();
3276   OopMap* map;
3277   CodeBuffer buffer(name, 2548, 1024);
3278   MacroAssembler* masm = new MacroAssembler(&buffer);
3279 
3280   address start   = __ pc();
3281   address call_pc = nullptr;
3282   int frame_size_in_words;
3283   bool cause_return = (id == SharedStubId::polling_page_return_handler_id);
3284   bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id);
3285 
3286   // Make room for return address (or push it again)
3287   if (!cause_return) {
3288     __ push(rbx);
3289   }
3290 
3291   // Save registers, fpu state, and flags
3292   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3293 
3294   // The following is basically a call_VM.  However, we need the precise
3295   // address of the call in order to generate an oopmap. Hence, we do all the
3296   // work ourselves.
3297 
3298   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3299 
3300   // The return address must always be correct so that the frame constructor
3301   // never sees an invalid pc.
3302 
3303   if (!cause_return) {
3304     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3305     // Additionally, rbx is a callee saved register and we can look at it later to determine
3306     // if someone changed the return address for us!
3307     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3308     __ movptr(Address(rbp, wordSize), rbx);
3309   }
3310 
3311   // Do the call
3312   __ mov(c_rarg0, r15_thread);
3313   __ call(RuntimeAddress(call_ptr));
3314 
3315   // Set an oopmap for the call site.  This oopmap will map all
3316   // oop-registers and debug-info registers as callee-saved.  This
3317   // will allow deoptimization at this safepoint to find all possible
3318   // debug-info recordings, as well as let GC find all oops.
3319 
3320   oop_maps->add_gc_map( __ pc() - start, map);
3321 
3322   Label noException;
3323 
3324   __ reset_last_Java_frame(false);
3325 
3326   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3327   __ jcc(Assembler::equal, noException);
3328 
3329   // Exception pending
3330 
3331   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3332 
3333   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3334 
3335   // No exception case
3336   __ bind(noException);
3337 
3338   Label no_adjust;
3339 #ifdef ASSERT
3340   Label bail;
3341 #endif
3342   if (!cause_return) {
3343     Label no_prefix, not_special, check_rex_prefix;
3344 
3345     // If our stashed return pc was modified by the runtime we avoid touching it
3346     __ cmpptr(rbx, Address(rbp, wordSize));
3347     __ jcc(Assembler::notEqual, no_adjust);
3348 
3349     // Skip over the poll instruction.
3350     // See NativeInstruction::is_safepoint_poll()
3351     // Possible encodings:
3352     //      85 00       test   %eax,(%rax)
3353     //      85 01       test   %eax,(%rcx)
3354     //      85 02       test   %eax,(%rdx)
3355     //      85 03       test   %eax,(%rbx)
3356     //      85 06       test   %eax,(%rsi)
3357     //      85 07       test   %eax,(%rdi)
3358     //
3359     //   41 85 00       test   %eax,(%r8)
3360     //   41 85 01       test   %eax,(%r9)
3361     //   41 85 02       test   %eax,(%r10)
3362     //   41 85 03       test   %eax,(%r11)
3363     //   41 85 06       test   %eax,(%r14)
3364     //   41 85 07       test   %eax,(%r15)
3365     //
3366     //      85 04 24    test   %eax,(%rsp)
3367     //   41 85 04 24    test   %eax,(%r12)
3368     //      85 45 00    test   %eax,0x0(%rbp)
3369     //   41 85 45 00    test   %eax,0x0(%r13)
3370     //
3371     // Notes:
3372     //  Format of the legacy MAP0 test instruction:
3373     //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
3374     //  o  For the safepoint polling instruction "test %eax,(%rax)", the encodings of the first
3375     //     register operand and of the base register of the memory operand lie in [0-8), so no
3376     //     additional REX prefix (whose REX.B bit would store the MSB of the register encoding)
3377     //     is required, which is why a two-byte encoding is sufficient here.
3378     //  o  For a safepoint polling instruction such as "test %eax,(%r8)", the encoding of the BASE
3379     //     register of the memory operand is 1000, so an additional REX prefix is needed in this
3380     //     case, thereby adding an extra byte to the instruction encoding.
3381     //  o  If the BASE register is one of the 32 extended GPR registers available only on targets
3382     //     supporting the Intel APX extension, a two-byte REX2 prefix must be emitted to hold the
3383     //     most significant two bits of the 5-bit register encoding.
3384 
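    // Summarizing the cases above, the skip length computed below is
    // (an informal sketch, mirroring the code that follows):
    //   skip = 2;                                      // opcode 0x85 + ModRM byte
    //   if (first byte is a REX2 prefix)  skip += 2;   // APX-capable targets only
    //   else if (it is a REX.B prefix)    skip += 1;
    //   if (base is rsp/rbp/r12/r13)      skip += 1;   // SIB or disp8 byte
    // The code advances rbx past any prefix and SIB/disp byte first, then adds
    // the final 2 bytes, so the stashed return pc ends up just after the poll.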
3385     if (VM_Version::supports_apx_f()) {
3386       __ cmpb(Address(rbx, 0), Assembler::REX2);
3387       __ jccb(Assembler::notEqual, check_rex_prefix);
3388       __ addptr(rbx, 2);
3389       __ bind(check_rex_prefix);
3390     }
3391     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3392     __ jccb(Assembler::notEqual, no_prefix);
3393     __ addptr(rbx, 1);
3394     __ bind(no_prefix);
3395 #ifdef ASSERT
3396     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3397 #endif
3398     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3399     // r12/rsp 0x04
3400     // r13/rbp 0x05
3401     __ movzbq(rcx, Address(rbx, 1));
3402     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3403     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3404     __ cmpptr(rcx, 1);
3405     __ jccb(Assembler::above, not_special);
3406     __ addptr(rbx, 1);
3407     __ bind(not_special);
3408 #ifdef ASSERT
3409     // Verify the correct encoding of the poll we're about to skip.
3410     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3411     __ jcc(Assembler::notEqual, bail);
3412     // Mask out the modrm bits
3413     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3414     // rax encodes to 0, so if the bits are nonzero it's incorrect
3415     __ jcc(Assembler::notZero, bail);
3416 #endif
3417     // Adjust return pc forward to step over the safepoint poll instruction
3418     __ addptr(rbx, 2);
3419     __ movptr(Address(rbp, wordSize), rbx);
3420   }
3421 
3422   __ bind(no_adjust);
3423   // Normal exit, restore registers and exit.
3424   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3425   __ ret(0);
3426 
3427 #ifdef ASSERT
3428   __ bind(bail);
3429   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3430 #endif
3431 
3432   // Make sure all code is generated
3433   masm->flush();
3434 
3435   // Fill-out other meta info
3436   SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3437 
3438   AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, (uint)id, name);
3439   return sp_blob;
3440 }
3441 
3442 //
3443 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3444 //
3445 // Generate a stub that calls into the VM to find out the proper destination
3446 // of a Java call. All the argument registers are live at this point,
3447 // but since this is generic code we don't know what they are, and the caller
3448 // must do any GC of the args.
3449 //
3450 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3451   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3452   assert(is_resolve_id(id), "expected a resolve stub id");
3453 
3454   const char* name = SharedRuntime::stub_name(id);
3455   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)id, name);
3456   if (blob != nullptr) {
3457     return blob->as_runtime_stub();
3458   }
3459 
3460   // allocate space for the code
3461   ResourceMark rm;
3462   CodeBuffer buffer(name, 1552, 512);
3463   MacroAssembler* masm = new MacroAssembler(&buffer);
3464 
3465   int frame_size_in_words;
3466 
3467   OopMapSet *oop_maps = new OopMapSet();
3468   OopMap* map = nullptr;
3469 
3470   int start = __ offset();
3471 
3472   // No need to save vector registers since they are caller-saved anyway.
3473   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3474 
3475   int frame_complete = __ offset();
3476 
3477   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3478 
3479   __ mov(c_rarg0, r15_thread);
3480 
3481   __ call(RuntimeAddress(destination));
3482 
3483 
3484   // Set an oopmap for the call site.
3485   // We need this not only for callee-saved registers, but also for volatile
3486   // registers that the compiler might be keeping live across a safepoint.
3487 
3488   oop_maps->add_gc_map( __ offset() - start, map);
3489 
3490   // rax contains the address we are going to jump to assuming no exception got installed
3491 
3492   // clear last_Java_sp
3493   __ reset_last_Java_frame(false);
3494   // check for pending exceptions
3495   Label pending;
3496   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3497   __ jcc(Assembler::notEqual, pending);
3498 
3499   // get the returned Method*
3500   __ get_vm_result_metadata(rbx);
3501   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3502 
3503   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3504 
3505   RegisterSaver::restore_live_registers(masm);
3506 
3507   // We are back to the original state on entry and ready to go.
3508 
3509   __ jmp(rax);
3510 
3511   // Pending exception after the safepoint
3512 
3513   __ bind(pending);
3514 
3515   RegisterSaver::restore_live_registers(masm);
3516 
3517   // exception pending => remove activation and forward to exception handler
3518 
3519   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3520 
3521   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3522   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3523 
3524   // -------------
3525   // make sure all code is generated
3526   masm->flush();
3527 
3528   // return the blob
3529   // (the frame size passed to new_runtime_stub is in words, not bytes)
3530   RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3531 
3532   AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, (uint)id, name);
3533   return rs_blob;
3534 }
3535 
3536 // Continuation point for throwing of implicit exceptions that are
3537 // not handled in the current activation. Fabricates an exception
3538 // oop and initiates normal exception dispatching in this
3539 // frame. Since we need to preserve callee-saved values (currently
3540 // only for C2, but done for C1 as well) we need a callee-saved oop
3541 // map and therefore have to make these stubs into RuntimeStubs
3542 // rather than BufferBlobs.  If the compiler needs all registers to
3543 // be preserved between the fault point and the exception handler
3544 // then it must assume responsibility for that in
3545 // AbstractCompiler::continuation_for_implicit_null_exception or
3546 // continuation_for_implicit_division_by_zero_exception. All other
3547 // implicit exceptions (e.g., NullPointerException or
3548 // AbstractMethodError on entry) are either at call sites or
3549 // otherwise assume that stack unwinding will be initiated, so
3550 // caller saved registers were assumed volatile in the compiler.
3551 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
3552   assert(is_throw_id(id), "expected a throw stub id");
3553 
3554   const char* name = SharedRuntime::stub_name(id);
3555 
3556   // Information about frame layout at time of blocking runtime call.
3557   // Note that we only have to preserve callee-saved registers since
3558   // the compilers are responsible for supplying a continuation point
3559   // if they expect all registers to be preserved.
3560   enum layout {
3561     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3562     rbp_off2,
3563     return_off,
3564     return_off2,
3565     framesize // inclusive of return address
3566   };
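
  // A note on the layout above (informal): offsets are counted in 32-bit (jint)
  // slots, so each 64-bit value takes a pair of entries (rbp_off/rbp_off2,
  // return_off/return_off2).  rbp_off starts after frame::arg_reg_save_area_bytes,
  // the Windows shadow space for outgoing register arguments (zero elsewhere),
  // and framesize includes the return address as stated above.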
3567 
3568   int insts_size = 512;
3569   int locs_size  = 64;
3570 
3571   const char* timer_msg = "SharedRuntime generate_throw_exception";
3572   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3573 
3574   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, (uint)id, name);
3575   if (blob != nullptr) {
3576     return blob->as_runtime_stub();
3577   }
3578 
3579   ResourceMark rm;
3580   CodeBuffer code(name, insts_size, locs_size);
3581   OopMapSet* oop_maps  = new OopMapSet();
3582   MacroAssembler* masm = new MacroAssembler(&code);
3583 
3584   address start = __ pc();
3585 
3586   // This is an inlined and slightly modified version of call_VM
3587   // which has the ability to fetch the return PC out of
3588   // thread-local storage and also sets up last_Java_sp slightly
3589   // differently than the real call_VM
3590 
3591   __ enter(); // required for proper stackwalking of RuntimeStub frame
3592 
3593   assert(is_even(framesize/2), "sp not 16-byte aligned");
3594 
3595   // return address and rbp are already in place
3596   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3597 
3598   int frame_complete = __ pc() - start;
3599 
3600   // Set up last_Java_sp and last_Java_fp
3601   address the_pc = __ pc();
3602   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3603   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3604 
3605   // Call runtime
3606   __ movptr(c_rarg0, r15_thread);
3607   BLOCK_COMMENT("call runtime_entry");
3608   __ call(RuntimeAddress(runtime_entry));
3609 
3610   // Generate oop map
3611   OopMap* map = new OopMap(framesize, 0);
3612 
3613   oop_maps->add_gc_map(the_pc - start, map);
3614 
3615   __ reset_last_Java_frame(true);
3616 
3617   __ leave(); // required for proper stackwalking of RuntimeStub frame
3618 
3619   // check for pending exceptions
3620 #ifdef ASSERT
3621   Label L;
3622   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3623   __ jcc(Assembler::notEqual, L);
3624   __ should_not_reach_here();
3625   __ bind(L);
3626 #endif // ASSERT
3627   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3628 
3629 
3630   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3631   RuntimeStub* stub =
3632     RuntimeStub::new_runtime_stub(name,
3633                                   &code,
3634                                   frame_complete,
3635                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3636                                   oop_maps, false);
3637   AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, (uint)id, name);
3638 
3639   return stub;
3640 }
3641 
3642 //------------------------------Montgomery multiplication------------------------
3643 //
3644 
3645 #ifndef _WINDOWS
3646 
3647 // Subtract 0:b from carry:a.  Return carry.
3648 static julong
3649 sub(julong a[], julong b[], julong carry, long len) {
3650   long long i = 0, cnt = len;
3651   julong tmp;
3652   asm volatile("clc; "
3653                "0: ; "
3654                "mov (%[b], %[i], 8), %[tmp]; "
3655                "sbb %[tmp], (%[a], %[i], 8); "
3656                "inc %[i]; dec %[cnt]; "
3657                "jne 0b; "
3658                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3659                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3660                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3661                : "memory");
3662   return tmp;
3663 }
3664 
3665 // Multiply (unsigned) Long A by Long B, accumulating the double-
3666 // length result into the accumulator formed of T0, T1, and T2.
3667 #define MACC(A, B, T0, T1, T2)                                  \
3668 do {                                                            \
3669   unsigned long hi, lo;                                         \
3670   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3671            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3672            : "r"(A), "a"(B) : "cc");                            \
3673  } while(0)
3674 
3675 // As above, but add twice the double-length result into the
3676 // accumulator.
3677 #define MACC2(A, B, T0, T1, T2)                                 \
3678 do {                                                            \
3679   unsigned long hi, lo;                                         \
3680   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3681            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3682            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3683            : "r"(A), "a"(B) : "cc");                            \
3684  } while(0)
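
// In portable terms (an illustrative equivalent, assuming a 128-bit temporary):
//   MACC(A, B, T0, T1, T2)  ~  (T2:T1:T0) +=     (unsigned __int128)A * B
//   MACC2(A, B, T0, T1, T2) ~  (T2:T1:T0) += 2 * (unsigned __int128)A * B
// i.e. the double-length product is added (once or twice) into the triple-word
// accumulator T2:T1:T0 with full carry propagation, which is what the
// mul/add/adc sequences above implement.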
3685 
3686 #else //_WINDOWS
3687 
3688 static julong
3689 sub(julong a[], julong b[], julong carry, long len) {
3690   long i;
3691   julong tmp;
3692   unsigned char c = 1;
3693   for (i = 0; i < len; i++) {
3694     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3695     a[i] = tmp;
3696   }
3697   c = _addcarry_u64(c, carry, ~0, &tmp);
3698   return tmp;
3699 }
3700 
3701 // Multiply (unsigned) Long A by Long B, accumulating the double-
3702 // length result into the accumulator formed of T0, T1, and T2.
3703 #define MACC(A, B, T0, T1, T2)                          \
3704 do {                                                    \
3705   julong hi, lo;                            \
3706   lo = _umul128(A, B, &hi);                             \
3707   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3708   c = _addcarry_u64(c, hi, T1, &T1);                    \
3709   _addcarry_u64(c, T2, 0, &T2);                         \
3710  } while(0)
3711 
3712 // As above, but add twice the double-length result into the
3713 // accumulator.
3714 #define MACC2(A, B, T0, T1, T2)                         \
3715 do {                                                    \
3716   julong hi, lo;                            \
3717   lo = _umul128(A, B, &hi);                             \
3718   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3719   c = _addcarry_u64(c, hi, T1, &T1);                    \
3720   _addcarry_u64(c, T2, 0, &T2);                         \
3721   c = _addcarry_u64(0, lo, T0, &T0);                    \
3722   c = _addcarry_u64(c, hi, T1, &T1);                    \
3723   _addcarry_u64(c, T2, 0, &T2);                         \
3724  } while(0)
3725 
3726 #endif //_WINDOWS
3727 
3728 // Fast Montgomery multiplication.  The derivation of the algorithm is
3729 // in  A Cryptographic Library for the Motorola DSP56000,
3730 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
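//
// A brief sketch of the reduction, for orientation (see the paper above for the
// derivation): with R = 2^(64*len) and inv chosen so that inv * n[0] == -1
// (mod 2^64), each outer iteration picks m[i] = t0 * inv, which makes
// t0 + m[i]*n[0] == 0 (mod 2^64); the low word of the accumulator therefore
// vanishes and is shifted out.  After all iterations m holds a value congruent
// to a * b * R^-1 (mod n), brought below n by the final subtraction loop.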
3731 
3732 static void NOINLINE
3733 montgomery_multiply(julong a[], julong b[], julong n[],
3734                     julong m[], julong inv, int len) {
3735   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3736   int i;
3737 
3738   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3739 
3740   for (i = 0; i < len; i++) {
3741     int j;
3742     for (j = 0; j < i; j++) {
3743       MACC(a[j], b[i-j], t0, t1, t2);
3744       MACC(m[j], n[i-j], t0, t1, t2);
3745     }
3746     MACC(a[i], b[0], t0, t1, t2);
3747     m[i] = t0 * inv;
3748     MACC(m[i], n[0], t0, t1, t2);
3749 
3750     assert(t0 == 0, "broken Montgomery multiply");
3751 
3752     t0 = t1; t1 = t2; t2 = 0;
3753   }
3754 
3755   for (i = len; i < 2*len; i++) {
3756     int j;
3757     for (j = i-len+1; j < len; j++) {
3758       MACC(a[j], b[i-j], t0, t1, t2);
3759       MACC(m[j], n[i-j], t0, t1, t2);
3760     }
3761     m[i-len] = t0;
3762     t0 = t1; t1 = t2; t2 = 0;
3763   }
3764 
3765   while (t0)
3766     t0 = sub(m, n, t0, len);
3767 }
3768 
3769 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3770 // multiplies so it should be up to 25% faster than Montgomery
3771 // multiplication.  However, its loop control is more complex and it
3772 // may actually run slower on some machines.
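//
// The saving comes from symmetry: when squaring, a[j]*a[i-j] == a[i-j]*a[j], so
// each off-diagonal product is computed once and accumulated twice via MACC2,
// and only the diagonal terms a[j]*a[j] need a plain MACC.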
3773 
3774 static void NOINLINE
3775 montgomery_square(julong a[], julong n[],
3776                   julong m[], julong inv, int len) {
3777   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3778   int i;
3779 
3780   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3781 
3782   for (i = 0; i < len; i++) {
3783     int j;
3784     int end = (i+1)/2;
3785     for (j = 0; j < end; j++) {
3786       MACC2(a[j], a[i-j], t0, t1, t2);
3787       MACC(m[j], n[i-j], t0, t1, t2);
3788     }
3789     if ((i & 1) == 0) {
3790       MACC(a[j], a[j], t0, t1, t2);
3791     }
3792     for (; j < i; j++) {
3793       MACC(m[j], n[i-j], t0, t1, t2);
3794     }
3795     m[i] = t0 * inv;
3796     MACC(m[i], n[0], t0, t1, t2);
3797 
3798     assert(t0 == 0, "broken Montgomery square");
3799 
3800     t0 = t1; t1 = t2; t2 = 0;
3801   }
3802 
3803   for (i = len; i < 2*len; i++) {
3804     int start = i-len+1;
3805     int end = start + (len - start)/2;
3806     int j;
3807     for (j = start; j < end; j++) {
3808       MACC2(a[j], a[i-j], t0, t1, t2);
3809       MACC(m[j], n[i-j], t0, t1, t2);
3810     }
3811     if ((i & 1) == 0) {
3812       MACC(a[j], a[j], t0, t1, t2);
3813     }
3814     for (; j < len; j++) {
3815       MACC(m[j], n[i-j], t0, t1, t2);
3816     }
3817     m[i-len] = t0;
3818     t0 = t1; t1 = t2; t2 = 0;
3819   }
3820 
3821   while (t0)
3822     t0 = sub(m, n, t0, len);
3823 }
3824 
3825 // Swap words in a longword.
3826 static julong swap(julong x) {
3827   return (x << 32) | (x >> 32);
3828 }
3829 
3830 // Copy len longwords from s to d, word-swapping as we go.  The
3831 // destination array is reversed.
3832 static void reverse_words(julong *s, julong *d, int len) {
3833   d += len;
3834   while(len-- > 0) {
3835     d--;
3836     *d = swap(*s);
3837     s++;
3838   }
3839 }
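
// The callers below use this to convert the incoming jint arrays (which
// effectively arrive most-significant word first) into the little-endian julong
// limbs that the Montgomery routines above operate on, and to convert the
// result back.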
3840 
3841 // The threshold at which squaring is advantageous was determined
3842 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3843 #define MONTGOMERY_SQUARING_THRESHOLD 64
3844 
3845 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3846                                         jint len, jlong inv,
3847                                         jint *m_ints) {
3848   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3849   int longwords = len/2;
3850 
3851   // Make very sure we don't use so much space that the stack might
3852   // overflow.  512 jints corresponds to a 16384-bit integer and
3853   // will use a total of 8k bytes of stack space here.
3854   int divisor = sizeof(julong) * 4;
3855   guarantee(longwords <= 8192 / divisor, "must be");
3856   int total_allocation = longwords * sizeof (julong) * 4;
3857   julong *scratch = (julong *)alloca(total_allocation);
3858 
3859   // Local scratch arrays
3860   julong
3861     *a = scratch + 0 * longwords,
3862     *b = scratch + 1 * longwords,
3863     *n = scratch + 2 * longwords,
3864     *m = scratch + 3 * longwords;
3865 
3866   reverse_words((julong *)a_ints, a, longwords);
3867   reverse_words((julong *)b_ints, b, longwords);
3868   reverse_words((julong *)n_ints, n, longwords);
3869 
3870   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3871 
3872   reverse_words(m, (julong *)m_ints, longwords);
3873 }
3874 
3875 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3876                                       jint len, jlong inv,
3877                                       jint *m_ints) {
3878   assert(len % 2 == 0, "array length in montgomery_square must be even");
3879   int longwords = len/2;
3880 
3881   // Make very sure we don't use so much space that the stack might
3882   // overflow.  512 jints corresponds to a 16384-bit integer and
3883   // will use a total of 6k bytes of stack space here.
3884   int divisor = sizeof(julong) * 3;
3885   guarantee(longwords <= (8192 / divisor), "must be");
3886   int total_allocation = longwords * sizeof (julong) * 3;
3887   julong *scratch = (julong *)alloca(total_allocation);
3888 
3889   // Local scratch arrays
3890   julong
3891     *a = scratch + 0 * longwords,
3892     *n = scratch + 1 * longwords,
3893     *m = scratch + 2 * longwords;
3894 
3895   reverse_words((julong *)a_ints, a, longwords);
3896   reverse_words((julong *)n_ints, n, longwords);
3897 
3898   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3899     ::montgomery_square(a, n, m, (julong)inv, longwords);
3900   } else {
3901     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3902   }
3903 
3904   reverse_words(m, (julong *)m_ints, longwords);
3905 }
3906 
3907 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
3908   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
3909   CodeBuffer buffer(buf);
3910   short buffer_locs[20];
3911   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
3912                                          sizeof(buffer_locs)/sizeof(relocInfo));
3913 
3914   MacroAssembler* masm = new MacroAssembler(&buffer);
3915 
3916   const Array<SigEntry>* sig_vk = vk->extended_sig();
3917   const Array<VMRegPair>* regs = vk->return_regs();
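
  // A note on the signature walk used by the loops below (informal): sig_vk lists
  // the flattened fields of the inline type; T_METADATA entries carry no value and
  // are skipped, and a T_VOID entry marks the upper half of a preceding
  // T_LONG/T_DOUBLE, which is why the register index j is bumped for those too.
  // j starts at 1 and each loop asserts j == regs->length() when done.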
3918 
3919   int pack_fields_jobject_off = __ offset();
3920   // Resolve pre-allocated buffer from JNI handle.
3921   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
3922   __ movptr(rax, Address(r13, 0));
3923   __ resolve_jobject(rax /* value */,
3924                      r12 /* tmp */);
3925   __ movptr(Address(r13, 0), rax);
3926 
3927   int pack_fields_off = __ offset();
3928 
3929   int j = 1;
3930   for (int i = 0; i < sig_vk->length(); i++) {
3931     BasicType bt = sig_vk->at(i)._bt;
3932     if (bt == T_METADATA) {
3933       continue;
3934     }
3935     if (bt == T_VOID) {
3936       if (sig_vk->at(i-1)._bt == T_LONG ||
3937           sig_vk->at(i-1)._bt == T_DOUBLE) {
3938         j++;
3939       }
3940       continue;
3941     }
3942     int off = sig_vk->at(i)._offset;
3943     assert(off > 0, "offset in object should be positive");
3944     VMRegPair pair = regs->at(j);
3945     VMReg r_1 = pair.first();
3946     VMReg r_2 = pair.second();
3947     Address to(rax, off);
3948     if (bt == T_FLOAT) {
3949       __ movflt(to, r_1->as_XMMRegister());
3950     } else if (bt == T_DOUBLE) {
3951       __ movdbl(to, r_1->as_XMMRegister());
3952     } else {
3953       Register val = r_1->as_Register();
3954       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
3955       if (is_reference_type(bt)) {
3956         __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
3957       } else {
3958         __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
3959       }
3960     }
3961     j++;
3962   }
3963   assert(j == regs->length(), "missed a field?");
3964   if (vk->has_nullable_atomic_layout()) {
3965     // Set the null marker
3966     __ movb(Address(rax, vk->null_marker_offset()), 1);
3967   }
3968   __ ret(0);
3969 
3970   int unpack_fields_off = __ offset();
3971 
3972   Label skip;
3973   Label not_null;
3974   __ testptr(rax, rax);
3975   __ jcc(Assembler::notZero, not_null);
3976 
3977   // Return value is null. Zero oop registers to make the GC happy.
3978   j = 1;
3979   for (int i = 0; i < sig_vk->length(); i++) {
3980     BasicType bt = sig_vk->at(i)._bt;
3981     if (bt == T_METADATA) {
3982       continue;
3983     }
3984     if (bt == T_VOID) {
3985       if (sig_vk->at(i-1)._bt == T_LONG ||
3986           sig_vk->at(i-1)._bt == T_DOUBLE) {
3987         j++;
3988       }
3989       continue;
3990     }
3991     if (bt == T_OBJECT || bt == T_ARRAY) {
3992       VMRegPair pair = regs->at(j);
3993       VMReg r_1 = pair.first();
3994       __ xorq(r_1->as_Register(), r_1->as_Register());
3995     }
3996     j++;
3997   }
3998   __ jmp(skip);
3999   __ bind(not_null);
4000 
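       // Non-null: copy each field from the buffer into its return register.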
4001   j = 1;
4002   for (int i = 0; i < sig_vk->length(); i++) {
4003     BasicType bt = sig_vk->at(i)._bt;
4004     if (bt == T_METADATA) {
4005       continue;
4006     }
4007     if (bt == T_VOID) {
4008       if (sig_vk->at(i-1)._bt == T_LONG ||
4009           sig_vk->at(i-1)._bt == T_DOUBLE) {
4010         j++;
4011       }
4012       continue;
4013     }
4014     int off = sig_vk->at(i)._offset;
4015     assert(off > 0, "offset in object should be positive");
4016     VMRegPair pair = regs->at(j);
4017     VMReg r_1 = pair.first();
4018     VMReg r_2 = pair.second();
4019     Address from(rax, off);
4020     if (bt == T_FLOAT) {
4021       __ movflt(r_1->as_XMMRegister(), from);
4022     } else if (bt == T_DOUBLE) {
4023       __ movdbl(r_1->as_XMMRegister(), from);
4024     } else if (bt == T_OBJECT || bt == T_ARRAY) {
4025       assert_different_registers(rax, r_1->as_Register());
4026       __ load_heap_oop(r_1->as_Register(), from);
4027     } else {
4028       assert(is_java_primitive(bt), "unexpected basic type");
4029       assert_different_registers(rax, r_1->as_Register());
4030       size_t size_in_bytes = type2aelembytes(bt);
4031       __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
4032     }
4033     j++;
4034   }
4035   assert(j == regs->length(), "missed a field?");
4036 
4037   __ bind(skip);
4038   __ ret(0);
4039 
4040   __ flush();
4041 
4042   return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
4043 }
4044 
4045 #if INCLUDE_JFR
4046 
4047 // For c2: c_rarg0 is junk; call into the runtime to write a checkpoint.
4048 // The call returns a jobject handle to the event writer.
4049 // The handle is dereferenced and the event writer oop is returned in rax.
4050 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
4051   enum layout {
4052     rbp_off,
4053     rbpH_off,
4054     return_off,
4055     return_off2,
4056     framesize // inclusive of return address
4057   };
4058 
4059   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id);
4060   CodeBuffer code(name, 1024, 64);
4061   MacroAssembler* masm = new MacroAssembler(&code);
4062   address start = __ pc();
4063 
4064   __ enter();
4065   address the_pc = __ pc();
4066 
4067   int frame_complete = the_pc - start;
4068 
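       // Record the last Java frame so the stack remains walkable across the leaf call.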
4069   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
4070   __ movptr(c_rarg0, r15_thread);
4071   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
4072   __ reset_last_Java_frame(true);
4073 
4074   // rax holds the jobject handle result; resolve it through a GC barrier to get the oop.
4075   __ resolve_global_jobject(rax, c_rarg0);
4076 
4077   __ leave();
4078   __ ret(0);
4079 
4080   OopMapSet* oop_maps = new OopMapSet();
4081   OopMap* map = new OopMap(framesize, 1);
4082   oop_maps->add_gc_map(frame_complete, map);
4083 
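       // framesize is in 32-bit VMReg slots; new_runtime_stub expects the frame
       // size in words, hence the shift by (LogBytesPerWord - LogBytesPerInt).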
4084   RuntimeStub* stub =
4085     RuntimeStub::new_runtime_stub(name,
4086                                   &code,
4087                                   frame_complete,
4088                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4089                                   oop_maps,
4090                                   false);
4091   return stub;
4092 }
4093 
4094 // For c2: call to return a leased buffer.
4095 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
4096   enum layout {
4097     rbp_off,
4098     rbpH_off,
4099     return_off,
4100     return_off2,
4101     framesize // inclusive of return address
4102   };
4103 
4104   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id);
4105   CodeBuffer code(name, 1024, 64);
4106   MacroAssembler* masm = new MacroAssembler(&code);
4107   address start = __ pc();
4108 
4109   __ enter();
4110   address the_pc = __ pc();
4111 
4112   int frame_complete = the_pc - start;
4113 
4114   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
4115   __ movptr(c_rarg0, r15_thread);
4116   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
4117   __ reset_last_Java_frame(true);
4118 
4119   __ leave();
4120   __ ret(0);
4121 
4122   OopMapSet* oop_maps = new OopMapSet();
4123   OopMap* map = new OopMap(framesize, 1);
4124   oop_maps->add_gc_map(frame_complete, map);
4125 
4126   RuntimeStub* stub =
4127     RuntimeStub::new_runtime_stub(name,
4128                                   &code,
4129                                   frame_complete,
4130                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4131                                   oop_maps,
4132                                   false);
4133   return stub;
4134 }
4135 
4136 #endif // INCLUDE_JFR