1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "classfile/symbolTable.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/timerTrace.hpp"
  56 #include "runtime/vframeArray.hpp"
  57 #include "runtime/vm_version.hpp"
  58 #include "utilities/align.hpp"
  59 #include "utilities/checkedCast.hpp"
  60 #include "utilities/formatBuffer.hpp"
  61 #include "vmreg_x86.inline.hpp"
  62 #ifdef COMPILER1
  63 #include "c1/c1_Runtime1.hpp"
  64 #endif
  65 #ifdef COMPILER2
  66 #include "opto/runtime.hpp"
  67 #endif
  68 #if INCLUDE_JVMCI
  69 #include "jvmci/jvmciJavaClasses.hpp"
  70 #endif
  71 
  72 #define __ masm->
  73 
  74 #ifdef PRODUCT
  75 #define BLOCK_COMMENT(str) /* nothing */
  76 #else
  77 #define BLOCK_COMMENT(str) __ block_comment(str)
  78 #endif // PRODUCT
  79 
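// The stack alignment requirement expressed in VMRegImpl stack slots (jint-sized).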
  80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  81 
  82 class RegisterSaver {
  83   // Capture info about frame layout.  Layout offsets are in jint
  84   // units because compiler frame slots are jints.
  85 #define XSAVE_AREA_BEGIN 160
  86 #define XSAVE_AREA_YMM_BEGIN 576
  87 #define XSAVE_AREA_EGPRS 960
  88 #define XSAVE_AREA_OPMASK_BEGIN 1088
  89 #define XSAVE_AREA_ZMM_BEGIN 1152
  90 #define XSAVE_AREA_UPPERBANK 1664
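// These byte offsets assume the standard (non-compacted) xsave image layout:
// xmm0 lives at offset 160 inside the 512-byte legacy fxsave region, the xsave
// header follows at 512, and the extended components saved by the code below
// (YMM upper halves, APX extended GPRs, AVX-512 opmask and ZMM state) start at
// the offsets defined above.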
  91 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  92 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  93 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  94 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  96   enum layout {
  97     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  98     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
  99     DEF_XMM_OFFS(0),
 100     DEF_XMM_OFFS(1),
 101     // 2..15 are implied in range usage
 102     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 103     DEF_YMM_OFFS(0),
 104     DEF_YMM_OFFS(1),
 105     r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 106     r16H_off,
 107     r17_off, r17H_off,
 108     r18_off, r18H_off,
 109     r19_off, r19H_off,
 110     r20_off, r20H_off,
 111     r21_off, r21H_off,
 112     r22_off, r22H_off,
 113     r23_off, r23H_off,
 114     r24_off, r24H_off,
 115     r25_off, r25H_off,
 116     r26_off, r26H_off,
 117     r27_off, r27H_off,
 118     r28_off, r28H_off,
 119     r29_off, r29H_off,
 120     r30_off, r30H_off,
 121     r31_off, r31H_off,
 122     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 123     DEF_OPMASK_OFFS(0),
 124     DEF_OPMASK_OFFS(1),
 125     // 2..7 are implied in range usage
 126     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 127     DEF_ZMM_OFFS(0),
 128     DEF_ZMM_OFFS(1),
 129     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 130     DEF_ZMM_UPPER_OFFS(16),
 131     DEF_ZMM_UPPER_OFFS(17),
 132     // 18..31 are implied in range usage
 133     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 134     fpu_stateH_end,
 135     r15_off, r15H_off,
 136     r14_off, r14H_off,
 137     r13_off, r13H_off,
 138     r12_off, r12H_off,
 139     r11_off, r11H_off,
 140     r10_off, r10H_off,
 141     r9_off,  r9H_off,
 142     r8_off,  r8H_off,
 143     rdi_off, rdiH_off,
 144     rsi_off, rsiH_off,
 145     ignore_off, ignoreH_off,  // extra copy of rbp
 146     rsp_off, rspH_off,
 147     rbx_off, rbxH_off,
 148     rdx_off, rdxH_off,
 149     rcx_off, rcxH_off,
 150     rax_off, raxH_off,
 151     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 152     align_off, alignH_off,
 153     flags_off, flagsH_off,
 154     // The frame sender code expects that rbp will be in the "natural" place and
 155     // will override any oopMap setting for it. We must therefore force the layout
 156     // so that it agrees with the frame sender code.
 157     rbp_off, rbpH_off,        // copy of rbp we will restore
 158     return_off, returnH_off,  // slot for return address
 159     reg_save_size             // size in compiler stack slots
 160   };
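  // Roughly, the frame built by save_live_registers() looks like this, from low
  // to high addresses: the optional argument register save area, the fxsave/xsave
  // image, the integer registers stored by save_legacy_gprs(), the 8-byte
  // alignment filler and flags, the rbp pushed by enter(), and finally the
  // caller's return address.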
 161 
 162  public:
 163   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 164   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 165 
 166   // Offsets into the register save area
 167   // Used by deoptimization when it is managing result register
 168   // values on its own
 169 
 170   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 171   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 172   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 173   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 174   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 175   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 176 
 177   // During deoptimization only the result registers need to be restored,
 178   // all the other values have already been extracted.
 179   static void restore_result_registers(MacroAssembler* masm);
 180 };
 181 
 182 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 183   int off = 0;
 184   int num_xmm_regs = XMMRegister::available_xmm_registers();
 185 #if COMPILER2_OR_JVMCI
 186   if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 188   }
 189   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 190 #else
 191   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 192 #endif
 193 
  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 195   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 196   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 197   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 198   // CodeBlob frame size is in words.
 199   int frame_size_in_words = frame_size_in_bytes / wordSize;
 200   *total_frame_words = frame_size_in_words;
 201 
 202   // Save registers, fpu state, and flags.
 203   // We assume caller has already pushed the return address onto the
 204   // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.
 207 
 208   __ enter();          // rsp becomes 16-byte aligned here
 209   __ pushf();
 210   // Make sure rsp stays 16-byte aligned
 211   __ subq(rsp, 8);
  // Push CPU state in multiples of 16 bytes
 213   __ save_legacy_gprs();
 214   __ push_FPU_state();
 215 
 216 
  // push_FPU_state() handles this on EVEX-enabled targets
 218   if (save_wide_vectors) {
 219     // Save upper half of YMM registers(0..15)
 220     int base_addr = XSAVE_AREA_YMM_BEGIN;
 221     for (int n = 0; n < 16; n++) {
 222       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 223     }
 224     if (VM_Version::supports_evex()) {
 225       // Save upper half of ZMM registers(0..15)
 226       base_addr = XSAVE_AREA_ZMM_BEGIN;
 227       for (int n = 0; n < 16; n++) {
 228         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 229       }
 230       // Save full ZMM registers(16..num_xmm_regs)
 231       base_addr = XSAVE_AREA_UPPERBANK;
 232       off = 0;
 233       int vector_len = Assembler::AVX_512bit;
 234       for (int n = 16; n < num_xmm_regs; n++) {
 235         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 236       }
 237 #if COMPILER2_OR_JVMCI
 238       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 239       off = 0;
 240       for(int n = 0; n < KRegister::number_of_registers; n++) {
 241         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 242       }
 243 #endif
 244     }
 245   } else {
 246     if (VM_Version::supports_evex()) {
 247       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 248       int base_addr = XSAVE_AREA_UPPERBANK;
 249       off = 0;
 250       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 251       for (int n = 16; n < num_xmm_regs; n++) {
 252         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 253       }
 254 #if COMPILER2_OR_JVMCI
 255       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 256       off = 0;
 257       for(int n = 0; n < KRegister::number_of_registers; n++) {
 258         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 259       }
 260 #endif
 261     }
 262   }
 263 
 264 #if COMPILER2_OR_JVMCI
 265   if (UseAPX) {
 266       int base_addr = XSAVE_AREA_EGPRS;
 267       off = 0;
 268       for (int n = 16; n < Register::number_of_registers; n++) {
 269         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 270       }
 271   }
 272 #endif
 273 
 274   __ vzeroupper();
 275   if (frame::arg_reg_save_area_bytes != 0) {
 276     // Allocate argument register save area
 277     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 278   }
 279 
 280   // Set an oopmap for the call site.  This oopmap will map all
 281   // oop-registers and debug-info registers as callee-saved.  This
 282   // will allow deoptimization at this safepoint to find all possible
 283   // debug-info recordings, as well as let GC find all oops.
 284 
 285   OopMapSet *oop_maps = new OopMapSet();
 286   OopMap* map = new OopMap(frame_size_in_slots, 0);
 287 
 288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 289 
 290   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 291   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 293   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 294   // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
 296   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 297   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 305   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 306 
 307   if (UseAPX) {
 308     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 309     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 324   }
 325   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 326   // on EVEX enabled targets, we get it included in the xsave area
 327   off = xmm0_off;
 328   int delta = xmm1_off - off;
 329   for (int n = 0; n < 16; n++) {
 330     XMMRegister xmm_name = as_XMMRegister(n);
 331     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 332     off += delta;
 333   }
 334   if (UseAVX > 2) {
 335     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 336     off = zmm16_off;
 337     delta = zmm17_off - off;
 338     for (int n = 16; n < num_xmm_regs; n++) {
 339       XMMRegister zmm_name = as_XMMRegister(n);
 340       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 341       off += delta;
 342     }
 343   }
 344 
 345 #if COMPILER2_OR_JVMCI
 346   if (save_wide_vectors) {
 347     // Save upper half of YMM registers(0..15)
 348     off = ymm0_off;
 349     delta = ymm1_off - ymm0_off;
 350     for (int n = 0; n < 16; n++) {
 351       XMMRegister ymm_name = as_XMMRegister(n);
 352       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 353       off += delta;
 354     }
 355     if (VM_Version::supports_evex()) {
 356       // Save upper half of ZMM registers(0..15)
 357       off = zmm0_off;
 358       delta = zmm1_off - zmm0_off;
 359       for (int n = 0; n < 16; n++) {
 360         XMMRegister zmm_name = as_XMMRegister(n);
 361         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 362         off += delta;
 363       }
 364     }
 365   }
 366 #endif // COMPILER2_OR_JVMCI
 367 
 368   // %%% These should all be a waste but we'll keep things as they were for now
 369   if (true) {
 370     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 371     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 373     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 374     // rbp location is known implicitly by the frame sender code, needs no oopmap
 375     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 376     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 385     if (UseAPX) {
 386       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 387       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 402     }
 403     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 404     // on EVEX enabled targets, we get it included in the xsave area
 405     off = xmm0H_off;
 406     delta = xmm1H_off - off;
 407     for (int n = 0; n < 16; n++) {
 408       XMMRegister xmm_name = as_XMMRegister(n);
 409       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 410       off += delta;
 411     }
 412     if (UseAVX > 2) {
 413       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 414       off = zmm16H_off;
 415       delta = zmm17H_off - off;
 416       for (int n = 16; n < num_xmm_regs; n++) {
 417         XMMRegister zmm_name = as_XMMRegister(n);
 418         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 419         off += delta;
 420       }
 421     }
 422   }
 423 
 424   return map;
 425 }
 426 
 427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 428   int num_xmm_regs = XMMRegister::available_xmm_registers();
 429   if (frame::arg_reg_save_area_bytes != 0) {
 430     // Pop arg register save area
 431     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 432   }
 433 
 434 #if COMPILER2_OR_JVMCI
 435   if (restore_wide_vectors) {
 436     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 437     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 438   }
 439 #else
 440   assert(!restore_wide_vectors, "vectors are generated only by C2");
 441 #endif
 442 
 443   __ vzeroupper();
 444 
 445   // On EVEX enabled targets everything is handled in pop fpu state
 446   if (restore_wide_vectors) {
 447     // Restore upper half of YMM registers (0..15)
 448     int base_addr = XSAVE_AREA_YMM_BEGIN;
 449     for (int n = 0; n < 16; n++) {
 450       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 451     }
 452     if (VM_Version::supports_evex()) {
 453       // Restore upper half of ZMM registers (0..15)
 454       base_addr = XSAVE_AREA_ZMM_BEGIN;
 455       for (int n = 0; n < 16; n++) {
 456         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 457       }
 458       // Restore full ZMM registers(16..num_xmm_regs)
 459       base_addr = XSAVE_AREA_UPPERBANK;
 460       int vector_len = Assembler::AVX_512bit;
 461       int off = 0;
 462       for (int n = 16; n < num_xmm_regs; n++) {
 463         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 464       }
 465 #if COMPILER2_OR_JVMCI
 466       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 467       off = 0;
 468       for (int n = 0; n < KRegister::number_of_registers; n++) {
 469         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 470       }
 471 #endif
 472     }
 473   } else {
 474     if (VM_Version::supports_evex()) {
 475       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 476       int base_addr = XSAVE_AREA_UPPERBANK;
 477       int off = 0;
 478       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 479       for (int n = 16; n < num_xmm_regs; n++) {
 480         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 481       }
 482 #if COMPILER2_OR_JVMCI
 483       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 484       off = 0;
 485       for (int n = 0; n < KRegister::number_of_registers; n++) {
 486         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 487       }
 488 #endif
 489     }
 490   }
 491 
 492 #if COMPILER2_OR_JVMCI
 493   if (UseAPX) {
 494     int base_addr = XSAVE_AREA_EGPRS;
 495     int off = 0;
 496     for (int n = 16; n < Register::number_of_registers; n++) {
 497       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 498     }
 499   }
 500 #endif
 501 
 502   // Recover CPU state
 503   __ pop_FPU_state();
 504   __ restore_legacy_gprs();
 505   __ addq(rsp, 8);
 506   __ popf();
 507   // Get the rbp described implicitly by the calling convention (no oopMap)
 508   __ pop(rbp);
 509 }
 510 
 511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 512 
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee-save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.
 518 
 519   // Restore fp result register
 520   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 521   // Restore integer result register
 522   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 523   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 524 
  // Pop all of the register save area off the stack except the return address
 526   __ addptr(rsp, return_offset_in_bytes());
 527 }
 528 
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
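// For example, 32-byte YMM and 64-byte ZMM vectors count as wide and take the
// extra save/restore paths in RegisterSaver above.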
 531 bool SharedRuntime::is_wide_vector(int size) {
 532   return size > 16;
 533 }
 534 
 535 // ---------------------------------------------------------------------------
 536 // Read the array of BasicTypes from a signature, and compute where the
 537 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 538 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 539 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 540 // as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot at 0(sp), and VMRegImpl::stack0+1
// refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
 545 
// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// 64-bit on this platform.  The OUTPUTS are in 32-bit units regardless of build.
 549 
 550 // The Java calling convention is a "shifted" version of the C ABI.
 551 // By skipping the first C ABI register we can call non-static jni methods
 552 // with small numbers of arguments without having to shuffle the arguments
 553 // at all. Since we control the java ABI we ought to at least get some
 554 // advantage out of it.
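// For example, a method taking (int, Object, double) receives the int in
// j_rarg0, the Object in j_rarg1 and the double in j_farg0. Only once the
// Argument::n_int_register_parameters_j integer or
// Argument::n_float_register_parameters_j float registers are exhausted do
// arguments spill to the stack, each in an 8-byte aligned slot (hence the
// align_up(stk_args, 2) below).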
 555 
 556 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 557                                            VMRegPair *regs,
 558                                            int total_args_passed) {
 559 
 560   // Create the mapping between argument positions and
 561   // registers.
 562   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 563     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 564   };
 565   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 566     j_farg0, j_farg1, j_farg2, j_farg3,
 567     j_farg4, j_farg5, j_farg6, j_farg7
 568   };
 569 
 570 
 571   uint int_args = 0;
 572   uint fp_args = 0;
 573   uint stk_args = 0;
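  // stk_args counts 4-byte outgoing stack slots; arguments passed in registers
  // do not contribute to it, and its final value is what this function returns.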
 574 
 575   for (int i = 0; i < total_args_passed; i++) {
 576     switch (sig_bt[i]) {
 577     case T_BOOLEAN:
 578     case T_CHAR:
 579     case T_BYTE:
 580     case T_SHORT:
 581     case T_INT:
 582       if (int_args < Argument::n_int_register_parameters_j) {
 583         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 584       } else {
 585         stk_args = align_up(stk_args, 2);
 586         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 587         stk_args += 1;
 588       }
 589       break;
 590     case T_VOID:
 591       // halves of T_LONG or T_DOUBLE
 592       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 593       regs[i].set_bad();
 594       break;
 595     case T_LONG:
 596       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 597       // fall through
 598     case T_OBJECT:
 599     case T_ARRAY:
 600     case T_ADDRESS:
 601       if (int_args < Argument::n_int_register_parameters_j) {
 602         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 603       } else {
 604         stk_args = align_up(stk_args, 2);
 605         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 606         stk_args += 2;
 607       }
 608       break;
 609     case T_FLOAT:
 610       if (fp_args < Argument::n_float_register_parameters_j) {
 611         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 612       } else {
 613         stk_args = align_up(stk_args, 2);
 614         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 615         stk_args += 1;
 616       }
 617       break;
 618     case T_DOUBLE:
 619       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 620       if (fp_args < Argument::n_float_register_parameters_j) {
 621         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 622       } else {
 623         stk_args = align_up(stk_args, 2);
 624         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 625         stk_args += 2;
 626       }
 627       break;
 628     default:
 629       ShouldNotReachHere();
 630       break;
 631     }
 632   }
 633 
 634   return stk_args;
 635 }
 636 
 637 // Same as java_calling_convention() but for multiple return
 638 // values. There's no way to store them on the stack so if we don't
 639 // have enough registers, multiple values can't be returned.
 640 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 641 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
 642 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 643                                           VMRegPair *regs,
 644                                           int total_args_passed) {
 645   // Create the mapping between argument positions and
 646   // registers.
 647   static const Register INT_ArgReg[java_return_convention_max_int] = {
 648     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 649   };
 650   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 651     j_farg0, j_farg1, j_farg2, j_farg3,
 652     j_farg4, j_farg5, j_farg6, j_farg7
 653   };
 654 
 655 
 656   uint int_args = 0;
 657   uint fp_args = 0;
 658 
 659   for (int i = 0; i < total_args_passed; i++) {
 660     switch (sig_bt[i]) {
 661     case T_BOOLEAN:
 662     case T_CHAR:
 663     case T_BYTE:
 664     case T_SHORT:
 665     case T_INT:
 666       if (int_args < Argument::n_int_register_parameters_j+1) {
 667         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 668         int_args++;
 669       } else {
 670         return -1;
 671       }
 672       break;
 673     case T_VOID:
 674       // halves of T_LONG or T_DOUBLE
 675       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 676       regs[i].set_bad();
 677       break;
 678     case T_LONG:
 679       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 680       // fall through
 681     case T_OBJECT:
 682     case T_ARRAY:
 683     case T_ADDRESS:
 684     case T_METADATA:
 685       if (int_args < Argument::n_int_register_parameters_j+1) {
 686         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 687         int_args++;
 688       } else {
 689         return -1;
 690       }
 691       break;
 692     case T_FLOAT:
 693       if (fp_args < Argument::n_float_register_parameters_j) {
 694         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 695         fp_args++;
 696       } else {
 697         return -1;
 698       }
 699       break;
 700     case T_DOUBLE:
 701       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 702       if (fp_args < Argument::n_float_register_parameters_j) {
 703         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 704         fp_args++;
 705       } else {
 706         return -1;
 707       }
 708       break;
 709     default:
 710       ShouldNotReachHere();
 711       break;
 712     }
 713   }
 714 
 715   return int_args + fp_args;
 716 }
 717 
// Patch the caller's callsite with the entry to compiled code, if it exists.
 719 static void patch_callers_callsite(MacroAssembler *masm) {
 720   Label L;
 721   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 722   __ jcc(Assembler::equal, L);
 723 
 724   // Save the current stack pointer
 725   __ mov(r13, rsp);
 726   // Schedule the branch target address early.
 727   // Call into the VM to patch the caller, then jump to compiled callee
 728   // rax isn't live so capture return address while we easily can
 729   __ movptr(rax, Address(rsp, 0));
 730 
 731   // align stack so push_CPU_state doesn't fault
 732   __ andptr(rsp, -(StackAlignmentInBytes));
 733   __ push_CPU_state();
 734   __ vzeroupper();
 735   // VM needs caller's callsite
 736   // VM needs target method
 737   // This needs to be a long call since we will relocate this adapter to
 738   // the codeBuffer and it may not reach
 739 
 740   // Allocate argument register save area
 741   if (frame::arg_reg_save_area_bytes != 0) {
 742     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 743   }
 744   __ mov(c_rarg0, rbx);
 745   __ mov(c_rarg1, rax);
 746   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 747 
 748   // De-allocate argument register save area
 749   if (frame::arg_reg_save_area_bytes != 0) {
 750     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 751   }
 752 
 753   __ vzeroupper();
 754   __ pop_CPU_state();
 755   // restore sp
 756   __ mov(rsp, r13);
 757   __ bind(L);
 758 }
 759 
 760 // For each inline type argument, sig includes the list of fields of
 761 // the inline type. This utility function computes the number of
 762 // arguments for the call if inline types are passed by reference (the
 763 // calling convention the interpreter expects).
 764 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 765   int total_args_passed = 0;
 766   if (InlineTypePassFieldsAsArgs) {
 767     for (int i = 0; i < sig_extended->length(); i++) {
 768       BasicType bt = sig_extended->at(i)._bt;
 769       if (bt == T_METADATA) {
 770         // In sig_extended, an inline type argument starts with:
 771         // T_METADATA, followed by the types of the fields of the
 772         // inline type and T_VOID to mark the end of the value
 773         // type. Inline types are flattened so, for instance, in the
 774         // case of an inline type with an int field and an inline type
 775         // field that itself has 2 fields, an int and a long:
 776         // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
 777         // slot for the T_LONG) T_VOID (inner inline type) T_VOID
 778         // (outer inline type)
 779         total_args_passed++;
 780         int vt = 1;
 781         do {
 782           i++;
 783           BasicType bt = sig_extended->at(i)._bt;
 784           BasicType prev_bt = sig_extended->at(i-1)._bt;
 785           if (bt == T_METADATA) {
 786             vt++;
 787           } else if (bt == T_VOID &&
 788                      prev_bt != T_LONG &&
 789                      prev_bt != T_DOUBLE) {
 790             vt--;
 791           }
 792         } while (vt != 0);
 793       } else {
 794         total_args_passed++;
 795       }
 796     }
 797   } else {
 798     total_args_passed = sig_extended->length();
 799   }
 800   return total_args_passed;
 801 }
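// For example, assuming a value class with an int field and a long field passed
// together with a plain int argument, sig_extended is
// { T_METADATA, T_INT, T_LONG, T_VOID, T_VOID, T_INT } (the first T_VOID is the
// long's second slot, the second one closes the inline type), and with
// InlineTypePassFieldsAsArgs compute_total_args_passed_int() returns 2.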
 802 
 803 
 804 static void gen_c2i_adapter_helper(MacroAssembler* masm,
 805                                    BasicType bt,
 806                                    BasicType prev_bt,
 807                                    size_t size_in_bytes,
 808                                    const VMRegPair& reg_pair,
 809                                    const Address& to,
 810                                    int extraspace,
 811                                    bool is_oop) {
 812   if (bt == T_VOID) {
 813     assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
 814     return;
 815   }
 816 
 817   // Say 4 args:
 818   // i   st_off
 819   // 0   32 T_LONG
 820   // 1   24 T_VOID
 821   // 2   16 T_OBJECT
 822   // 3    8 T_BOOL
 823   // -    0 return address
 824   //
  // However, to make things extra confusing: because we can fit a long/double in
  // a single slot on a 64-bit VM and it would be silly to break them up, the
  // interpreter leaves one slot empty and only stores to a single slot. In this
  // case the slot that is occupied is the T_VOID slot. See, I said it was confusing.
 829 
 830   bool wide = (size_in_bytes == wordSize);
 831   VMReg r_1 = reg_pair.first();
 832   VMReg r_2 = reg_pair.second();
 833   assert(r_2->is_valid() == wide, "invalid size");
 834   if (!r_1->is_valid()) {
 835     assert(!r_2->is_valid(), "must be invalid");
 836     return;
 837   }
 838 
 839   if (!r_1->is_XMMRegister()) {
 840     Register val = rax;
 841     if (r_1->is_stack()) {
 842       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 843       __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
 844     } else {
 845       val = r_1->as_Register();
 846     }
 847     assert_different_registers(to.base(), val, rscratch1);
 848     if (is_oop) {
 849       __ push(r13);
 850       __ push(rbx);
 851       __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
 852       __ pop(rbx);
 853       __ pop(r13);
 854     } else {
 855       __ store_sized_value(to, val, size_in_bytes);
 856     }
 857   } else {
 858     if (wide) {
 859       __ movdbl(to, r_1->as_XMMRegister());
 860     } else {
 861       __ movflt(to, r_1->as_XMMRegister());
 862     }
 863   }
 864 }
 865 
 866 static void gen_c2i_adapter(MacroAssembler *masm,
 867                             const GrowableArray<SigEntry>* sig_extended,
 868                             const VMRegPair *regs,
 869                             bool requires_clinit_barrier,
 870                             address& c2i_no_clinit_check_entry,
 871                             Label& skip_fixup,
 872                             address start,
 873                             OopMapSet* oop_maps,
 874                             int& frame_complete,
 875                             int& frame_size_in_words,
 876                             bool alloc_inline_receiver) {
 877   if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) {
 878     Label L_skip_barrier;
 879     Register method = rbx;
 880 
 881     { // Bypass the barrier for non-static methods
 882       Register flags = rscratch1;
 883       __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
 884       __ testl(flags, JVM_ACC_STATIC);
 885       __ jcc(Assembler::zero, L_skip_barrier); // non-static
 886     }
 887 
 888     Register klass = rscratch1;
 889     __ load_method_holder(klass, method);
 890     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
 891 
 892     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
 893 
 894     __ bind(L_skip_barrier);
 895     c2i_no_clinit_check_entry = __ pc();
 896   }
 897 
 898   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 899   bs->c2i_entry_barrier(masm);
 900 
 901   // Before we get into the guts of the C2I adapter, see if we should be here
 902   // at all.  We've come from compiled code and are attempting to jump to the
 903   // interpreter, which means the caller made a static call to get here
 904   // (vcalls always get a compiled target if there is one).  Check for a
 905   // compiled target.  If there is one, we need to patch the caller's call.
 906   patch_callers_callsite(masm);
 907 
 908   __ bind(skip_fixup);
 909 
 910   if (InlineTypePassFieldsAsArgs) {
 911     // Is there an inline type argument?
 912     bool has_inline_argument = false;
 913     for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
 914       has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
 915     }
 916     if (has_inline_argument) {
      // There is at least one inline type argument: we're coming from
 918       // compiled code so we have no buffers to back the inline types.
 919       // Allocate the buffers here with a runtime call.
 920       OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
 921 
 922       frame_complete = __ offset();
 923 
 924       __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
 925 
 926       __ mov(c_rarg0, r15_thread);
 927       __ mov(c_rarg1, rbx);
 928       __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
 929       __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
 930 
 931       oop_maps->add_gc_map((int)(__ pc() - start), map);
 932       __ reset_last_Java_frame(false);
 933 
 934       RegisterSaver::restore_live_registers(masm);
 935 
 936       Label no_exception;
 937       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
 938       __ jcc(Assembler::equal, no_exception);
 939 
 940       __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
 941       __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 942       __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 943 
 944       __ bind(no_exception);
 945 
 946       // We get an array of objects from the runtime call
 947       __ get_vm_result_oop(rscratch2); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
 948       __ get_vm_result_metadata(rbx); // TODO: required to keep the callee Method live?
 949     }
 950   }
 951 
 952   // Since all args are passed on the stack, total_args_passed *
 953   // Interpreter::stackElementSize is the space we need.
 954   int total_args_passed = compute_total_args_passed_int(sig_extended);
 955   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 956 
 957   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 958 
 959   // stack is aligned, keep it that way
 960   // This is not currently needed or enforced by the interpreter, but
 961   // we might as well conform to the ABI.
 962   extraspace = align_up(extraspace, 2*wordSize);
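  // For example, with 3 interpreter arguments and an 8-byte stackElementSize
  // this is 24 bytes, rounded up to 32.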
 963 
 964   // set senderSP value
 965   __ lea(r13, Address(rsp, wordSize));
 966 
 967 #ifdef ASSERT
 968   __ check_stack_alignment(r13, "sender stack not aligned");
 969 #endif
 970   if (extraspace > 0) {
 971     // Pop the return address
 972     __ pop(rax);
 973 
 974     __ subptr(rsp, extraspace);
 975 
 976     // Push the return address
 977     __ push(rax);
 978 
 979     // Account for the return address location since we store it first rather
 980     // than hold it in a register across all the shuffling
 981     extraspace += wordSize;
 982   }
 983 
 984 #ifdef ASSERT
 985   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 986 #endif
 987 
 988   // Now write the args into the outgoing interpreter space
 989 
 990   // next_arg_comp is the next argument from the compiler point of
 991   // view (inline type fields are passed in registers/on the stack). In
 992   // sig_extended, an inline type argument starts with: T_METADATA,
 993   // followed by the types of the fields of the inline type and T_VOID
 994   // to mark the end of the inline type. ignored counts the number of
 995   // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
 996   // used to get the buffer for that argument from the pool of buffers
 997   // we allocated above and want to pass to the
 998   // interpreter. next_arg_int is the next argument from the
 999   // interpreter point of view (inline types are passed by reference).
1000   for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
1001        next_arg_comp < sig_extended->length(); next_arg_comp++) {
1002     assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
1003     assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
1004     BasicType bt = sig_extended->at(next_arg_comp)._bt;
1005     int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
1006     if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
1007       int next_off = st_off - Interpreter::stackElementSize;
1008       const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
1009       const VMRegPair reg_pair = regs[next_arg_comp-ignored];
1010       size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
1011       gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1012                              size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
1013       next_arg_int++;
1014 #ifdef ASSERT
1015       if (bt == T_LONG || bt == T_DOUBLE) {
1016         // Overwrite the unused slot with known junk
1017         __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
1018         __ movptr(Address(rsp, st_off), rax);
1019       }
1020 #endif /* ASSERT */
1021     } else {
1022       ignored++;
1023       // get the buffer from the just allocated pool of buffers
1024       int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
1025       __ load_heap_oop(r14, Address(rscratch2, index));
1026       next_vt_arg++; next_arg_int++;
1027       int vt = 1;
1028       // write fields we get from compiled code in registers/stack
1029       // slots to the buffer: we know we are done with that inline type
1030       // argument when we hit the T_VOID that acts as an end of inline
1031       // type delimiter for this inline type. Inline types are flattened
1032       // so we might encounter embedded inline types. Each entry in
1033       // sig_extended contains a field offset in the buffer.
1034       Label L_null;
1035       do {
1036         next_arg_comp++;
1037         BasicType bt = sig_extended->at(next_arg_comp)._bt;
1038         BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
1039         if (bt == T_METADATA) {
1040           vt++;
1041           ignored++;
1042         } else if (bt == T_VOID &&
1043                    prev_bt != T_LONG &&
1044                    prev_bt != T_DOUBLE) {
1045           vt--;
1046           ignored++;
1047         } else {
1048           int off = sig_extended->at(next_arg_comp)._offset;
1049           if (off == -1) {
1050             // Nullable inline type argument, emit null check
1051             VMReg reg = regs[next_arg_comp-ignored].first();
1052             Label L_notNull;
1053             if (reg->is_stack()) {
1054               int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
1055               __ testb(Address(rsp, ld_off), 1);
1056             } else {
1057               __ testb(reg->as_Register(), 1);
1058             }
1059             __ jcc(Assembler::notZero, L_notNull);
1060             __ movptr(Address(rsp, st_off), 0);
1061             __ jmp(L_null);
1062             __ bind(L_notNull);
1063             continue;
1064           }
1065           assert(off > 0, "offset in object should be positive");
1066           size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
1067           bool is_oop = is_reference_type(bt);
1068           gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1069                                  size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
1070         }
1071       } while (vt != 0);
1072       // pass the buffer to the interpreter
1073       __ movptr(Address(rsp, st_off), r14);
1074       __ bind(L_null);
1075     }
1076   }
1077 
1078   // Schedule the branch target address early.
1079   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
1080   __ jmp(rcx);
1081 }
1082 
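// Range check used by the VerifyAdapterCalls code below: branches to L_ok when
// pc_reg lies strictly inside (code_start, code_end), otherwise falls through
// past L_fail so the caller can emit its own error handling.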
1083 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
1084                         address code_start, address code_end,
1085                         Label& L_ok) {
1086   Label L_fail;
1087   __ lea(temp_reg, AddressLiteral(code_start, relocInfo::none));
1088   __ cmpptr(pc_reg, temp_reg);
1089   __ jcc(Assembler::belowEqual, L_fail);
1090   __ lea(temp_reg, AddressLiteral(code_end, relocInfo::none));
1091   __ cmpptr(pc_reg, temp_reg);
1092   __ jcc(Assembler::below, L_ok);
1093   __ bind(L_fail);
1094 }
1095 
1096 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
1097                                     int comp_args_on_stack,
1098                                     const GrowableArray<SigEntry>* sig,
1099                                     const VMRegPair *regs) {
1100 
  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args, because
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment expected by all compiled code, and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
1109 
1110   // Adapters can be frameless because they do not require the caller
1111   // to perform additional cleanup work, such as correcting the stack pointer.
1112   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
1113   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
1114   // even if a callee has modified the stack pointer.
1115   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
1116   // routinely repairs its caller's stack pointer (from sender_sp, which is set
1117   // up via the senderSP register).
1118   // In other words, if *either* the caller or callee is interpreted, we can
1119   // get the stack pointer repaired after a call.
1120   // This is why c2i and i2c adapters cannot be indefinitely composed.
1121   // In particular, if a c2i adapter were to somehow call an i2c adapter,
1122   // both caller and callee would be compiled methods, and neither would
1123   // clean up the stack pointer changes performed by the two adapters.
1124   // If this happens, control eventually transfers back to the compiled
1125   // caller, but with an uncorrected stack, causing delayed havoc.
1126 
1127   if (VerifyAdapterCalls &&
1128       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
1129     // So, let's test for cascading c2i/i2c adapters right now.
1130     //  assert(Interpreter::contains($return_addr) ||
1131     //         StubRoutines::contains($return_addr),
1132     //         "i2c adapter must return to an interpreter frame");
1133     __ block_comment("verify_i2c { ");
1134     // Pick up the return address
1135     __ movptr(rax, Address(rsp, 0));
1136     Label L_ok;
1137     if (Interpreter::code() != nullptr) {
1138       range_check(masm, rax, r11,
1139                   Interpreter::code()->code_start(),
1140                   Interpreter::code()->code_end(),
1141                   L_ok);
1142     }
1143     if (StubRoutines::initial_stubs_code() != nullptr) {
1144       range_check(masm, rax, r11,
1145                   StubRoutines::initial_stubs_code()->code_begin(),
1146                   StubRoutines::initial_stubs_code()->code_end(),
1147                   L_ok);
1148     }
1149     if (StubRoutines::final_stubs_code() != nullptr) {
1150       range_check(masm, rax, r11,
1151                   StubRoutines::final_stubs_code()->code_begin(),
1152                   StubRoutines::final_stubs_code()->code_end(),
1153                   L_ok);
1154     }
1155     const char* msg = "i2c adapter must return to an interpreter frame";
1156     __ block_comment(msg);
1157     __ stop(msg);
1158     __ bind(L_ok);
1159     __ block_comment("} verify_i2ce ");
1160   }
1161 
1162   // Must preserve original SP for loading incoming arguments because
1163   // we need to align the outgoing SP for compiled code.
1164   __ movptr(r11, rsp);
1165 
1166   // Pick up the return address
1167   __ pop(rax);
1168 
1169   // Convert 4-byte c2 stack slots to words.
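  // (e.g. 5 slots -> 20 bytes -> rounded up to 24 bytes -> 3 words)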
1170   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
1171 
1172   if (comp_args_on_stack) {
1173     __ subptr(rsp, comp_words_on_stack * wordSize);
1174   }
1175 
1176   // Ensure compiled code always sees stack at proper alignment
1177   __ andptr(rsp, -16);
1178 
  // Push the return address, misaligning the stack just as the youngest frame
  // always sees it right after the placement of a call instruction.
1181   __ push(rax);
1182 
1183   // Put saved SP in another register
1184   const Register saved_sp = rax;
1185   __ movptr(saved_sp, r11);
1186 
1187   // Will jump to the compiled code just as if compiled code was doing it.
1188   // Pre-load the register-jump target early, to schedule it better.
1189   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1190 
1191 #if INCLUDE_JVMCI
1192   if (EnableJVMCI) {
1193     // check if this call should be routed towards a specific entry point
1194     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1195     Label no_alternative_target;
1196     __ jcc(Assembler::equal, no_alternative_target);
1197     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1198     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1199     __ bind(no_alternative_target);
1200   }
1201 #endif // INCLUDE_JVMCI
1202 
1203   int total_args_passed = sig->length();
1204 
1205   // Now generate the shuffle code.  Pick up all register args and move the
1206   // rest through the floating point stack top.
1207   for (int i = 0; i < total_args_passed; i++) {
1208     BasicType bt = sig->at(i)._bt;
1209     if (bt == T_VOID) {
1210       // Longs and doubles are passed in native word order, but misaligned
1211       // in the 32-bit build.
1212       BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1213       assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1214       continue;
1215     }
1216 
1217     // Pick up 0, 1 or 2 words from SP+offset.
1218 
1219     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1220             "scrambled load targets?");
1221     // Load in argument order going down.
1222     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1223     // Point to interpreter value (vs. tag)
1224     int next_off = ld_off - Interpreter::stackElementSize;
1228     VMReg r_1 = regs[i].first();
1229     VMReg r_2 = regs[i].second();
1230     if (!r_1->is_valid()) {
1231       assert(!r_2->is_valid(), "");
1232       continue;
1233     }
1234     if (r_1->is_stack()) {
1235       // Convert stack slot to an SP offset (+ wordSize to account for return address )
1236       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1237 
      // We can use r13 as a temp here because compiled code doesn't need r13 as an
      // input, and if we end up going through a c2i because of a miss, a reasonable
      // value of r13 will be generated.
1241       if (!r_2->is_valid()) {
1242         // sign extend???
1243         __ movl(r13, Address(saved_sp, ld_off));
1244         __ movptr(Address(rsp, st_off), r13);
1245       } else {
1246         //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
1249         // So we must adjust where to pick up the data to match the interpreter.
1250         //
1251         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
1252         // are accessed as negative so LSW is at LOW address
1253 
1254         // ld_off is MSW so get LSW
1255         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1256                            next_off : ld_off;
1257         __ movq(r13, Address(saved_sp, offset));
1258         // st_off is LSW (i.e. reg.first())
1259         __ movq(Address(rsp, st_off), r13);
1260       }
1261     } else if (r_1->is_Register()) {  // Register argument
1262       Register r = r_1->as_Register();
1263       assert(r != rax, "must be different");
1264       if (r_2->is_valid()) {
1265         //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case.
1268         // So we must adjust where to pick up the data to match the interpreter.
1269 
1270         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1271                            next_off : ld_off;
1272 
1273         // this can be a misaligned move
1274         __ movq(r, Address(saved_sp, offset));
1275       } else {
1276         // sign extend and use a full word?
1277         __ movl(r, Address(saved_sp, ld_off));
1278       }
1279     } else {
1280       if (!r_2->is_valid()) {
1281         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1282       } else {
1283         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1284       }
1285     }
1286   }
1287 
1288   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1289 
1290   // 6243940 We might end up in handle_wrong_method if
1291   // the callee is deoptimized as we race thru here. If that
1292   // happens we don't want to take a safepoint because the
1293   // caller frame will look interpreted and arguments are now
1294   // "compiled" so it is much better to make this transition
1295   // invisible to the stack walking code. Unfortunately if
1296   // we try and find the callee by normal means a safepoint
1297   // is possible. So we stash the desired callee in the thread
1298   // and the VM will find it there should this case occur.
1299 
1300   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1301 
1302   // Put the Method* where a c2i adapter would expect it, should we end up there.
1303   // This is only needed because c2 resolve stubs return the Method* as a result in
1304   // rax.
1305   __ mov(rax, rbx);
1306   __ jmp(r11);
1307 }
1308 
1309 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1310   Register data = rax;
1311   __ ic_check(1 /* end_alignment */);
1312   __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
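       // On an inline cache hit rbx now holds the speculated Method*, which is
       // where the c2i entry below expects to find it.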
1313 
1314   // The method might have been compiled since the call site was patched to
1315   // interpreted; if that is the case, treat it as a miss so we can get
1316   // the call site corrected.
1317   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1318   __ jcc(Assembler::equal, skip_fixup);
1319   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1320 }
1321 
1322 // ---------------------------------------------------------------
1323 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1324                                                             int comp_args_on_stack,
1325                                                             const GrowableArray<SigEntry>* sig,
1326                                                             const VMRegPair* regs,
1327                                                             const GrowableArray<SigEntry>* sig_cc,
1328                                                             const VMRegPair* regs_cc,
1329                                                             const GrowableArray<SigEntry>* sig_cc_ro,
1330                                                             const VMRegPair* regs_cc_ro,
1331                                                             AdapterFingerPrint* fingerprint,
1332                                                             AdapterBlob*& new_adapter,
1333                                                             bool allocate_code_blob) {
1334   address i2c_entry = __ pc();
1335   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1336 
1337   // -------------------------------------------------------------------------
1338   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1339   // to the interpreter.  The args start out packed in the compiled layout.  They
1340   // need to be unpacked into the interpreter layout.  This will almost always
1341   // require some stack space.  We grow the current (compiled) stack, then repack
1342   // the args.  We finally end in a jump to the generic interpreter entry point.
1343   // On exit from the interpreter, the interpreter will restore our SP (lest the
1344   // compiled code, which relies solely on SP and not RBP, get sick).
1345 
1346   address c2i_unverified_entry        = __ pc();
1347   address c2i_unverified_inline_entry = __ pc();
1348   Label skip_fixup;
1349 
1350   gen_inline_cache_check(masm, skip_fixup);
1351 
1352   OopMapSet* oop_maps = new OopMapSet();
1353   int frame_complete = CodeOffsets::frame_never_safe;
1354   int frame_size_in_words = 0;
1355 
1356   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1357   address c2i_no_clinit_check_entry = nullptr;
1358   address c2i_inline_ro_entry = __ pc();
1359   if (regs_cc != regs_cc_ro) {
1360     // No class init barrier needed because method is guaranteed to be non-static
1361     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, c2i_no_clinit_check_entry,
1362                     skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1363     skip_fixup.reset();
1364   }
1365 
1366   // Scalarized c2i adapter
1367   address c2i_entry        = __ pc();
1368   address c2i_inline_entry = __ pc();
1369   gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1370                   skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);
1371 
1372   // Non-scalarized c2i adapter
1373   if (regs != regs_cc) {
1374     c2i_unverified_inline_entry = __ pc();
1375     Label inline_entry_skip_fixup;
1376     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1377 
1378     c2i_inline_entry = __ pc();
1379     gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1380                     inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1381   }
1382 
1383   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1384   // the GC knows about the location of oop argument locations passed to the c2i adapter.
1385   if (allocate_code_blob) {
1386     bool caller_must_gc_arguments = (regs != regs_cc);
1387     new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1388   }
1389 
1390   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry, c2i_unverified_inline_entry, c2i_no_clinit_check_entry);
1391 }
1392 
1393 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1394                                          VMRegPair *regs,
1395                                          int total_args_passed) {
1396 
1397 // We return the number of VMRegImpl stack slots we need to reserve for all
1398 // the arguments NOT counting out_preserve_stack_slots.
1399 
1400 // NOTE: These arrays will have to change when c1 is ported
1401 #ifdef _WIN64
1402     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1403       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1404     };
1405     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1406       c_farg0, c_farg1, c_farg2, c_farg3
1407     };
1408 #else
1409     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1410       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1411     };
1412     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1413       c_farg0, c_farg1, c_farg2, c_farg3,
1414       c_farg4, c_farg5, c_farg6, c_farg7
1415     };
1416 #endif // _WIN64
1417 
1418 
1419     uint int_args = 0;
1420     uint fp_args = 0;
1421     uint stk_args = 0; // inc by 2 each time
1422 
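         // The assignment below hands out integer and FP registers independently
         // on SysV, but from one shared argument position on Win64, where each
         // register argument also consumes a register of the other kind plus a
         // shadow-space stack slot (hence the extra increments under #ifdef _WIN64).
         // For example, the signature (int, double, long) is assigned
         // c_rarg0, c_farg0, c_rarg1 on SysV but c_rarg0, c_farg1, c_rarg2 on Win64.
         // Stack arguments always take two 32-bit VMRegImpl slots (one 64-bit
         // word), which is why stk_args is bumped by 2 at a time.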
1423     for (int i = 0; i < total_args_passed; i++) {
1424       switch (sig_bt[i]) {
1425       case T_BOOLEAN:
1426       case T_CHAR:
1427       case T_BYTE:
1428       case T_SHORT:
1429       case T_INT:
1430         if (int_args < Argument::n_int_register_parameters_c) {
1431           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1432 #ifdef _WIN64
1433           fp_args++;
1434           // Allocate slots for the callee to stuff register args on the stack.
1435           stk_args += 2;
1436 #endif
1437         } else {
1438           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1439           stk_args += 2;
1440         }
1441         break;
1442       case T_LONG:
1443         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1444         // fall through
1445       case T_OBJECT:
1446       case T_ARRAY:
1447       case T_ADDRESS:
1448       case T_METADATA:
1449         if (int_args < Argument::n_int_register_parameters_c) {
1450           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1451 #ifdef _WIN64
1452           fp_args++;
1453           stk_args += 2;
1454 #endif
1455         } else {
1456           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1457           stk_args += 2;
1458         }
1459         break;
1460       case T_FLOAT:
1461         if (fp_args < Argument::n_float_register_parameters_c) {
1462           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1463 #ifdef _WIN64
1464           int_args++;
1465           // Allocate slots for the callee to stuff register args on the stack.
1466           stk_args += 2;
1467 #endif
1468         } else {
1469           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1470           stk_args += 2;
1471         }
1472         break;
1473       case T_DOUBLE:
1474         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1475         if (fp_args < Argument::n_float_register_parameters_c) {
1476           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1477 #ifdef _WIN64
1478           int_args++;
1479           // Allocate slots for the callee to stuff register args on the stack.
1480           stk_args += 2;
1481 #endif
1482         } else {
1483           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1484           stk_args += 2;
1485         }
1486         break;
1487       case T_VOID: // Halves of longs and doubles
1488         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1489         regs[i].set_bad();
1490         break;
1491       default:
1492         ShouldNotReachHere();
1493         break;
1494       }
1495     }
1496 #ifdef _WIN64
1497   // The Windows ABI requires that we always allocate enough stack space
1498   // for 4 64-bit registers to be stored down.
1499   if (stk_args < 8) {
1500     stk_args = 8;
1501   }
1502 #endif // _WIN64
1503 
1504   return stk_args;
1505 }
1506 
1507 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1508                                              uint num_bits,
1509                                              uint total_args_passed) {
1510   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1511          "only certain vector sizes are supported for now");
1512 
1513   static const XMMRegister VEC_ArgReg[32] = {
1514      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1515      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1516     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1517     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1518   };
1519 
1520   uint stk_args = 0;
1521   uint fp_args = 0;
1522 
1523   for (uint i = 0; i < total_args_passed; i++) {
1524     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1525     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
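         // Each VMRegImpl slot covers 32 bits of the register, so a num_bits wide
         // vector spans num_bits/32 consecutive slots; next_val is the index of
         // the last slot relative to the first (1, 3, 7 or 15).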
1526     regs[i].set_pair(vmreg->next(next_val), vmreg);
1527   }
1528 
1529   return stk_args;
1530 }
1531 
1532 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1533   // We always ignore the frame_slots arg and just use the space just below the frame
1534   // pointer, which by this time is free to use.
1535   switch (ret_type) {
1536   case T_FLOAT:
1537     __ movflt(Address(rbp, -wordSize), xmm0);
1538     break;
1539   case T_DOUBLE:
1540     __ movdbl(Address(rbp, -wordSize), xmm0);
1541     break;
1542   case T_VOID:  break;
1543   default: {
1544     __ movptr(Address(rbp, -wordSize), rax);
1545     }
1546   }
1547 }
1548 
1549 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1550   // We always ignore the frame_slots arg and just use the space just below the frame
1551   // pointer, which by this time is free to use.
1552   switch (ret_type) {
1553   case T_FLOAT:
1554     __ movflt(xmm0, Address(rbp, -wordSize));
1555     break;
1556   case T_DOUBLE:
1557     __ movdbl(xmm0, Address(rbp, -wordSize));
1558     break;
1559   case T_VOID:  break;
1560   default: {
1561     __ movptr(rax, Address(rbp, -wordSize));
1562     }
1563   }
1564 }
1565 
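     // Spill the live outgoing argument registers around a VM leaf call (used below
     // for the DTrace and RedefineClasses tracing probes); restore_args undoes this
     // in reverse order. GPR args are pushed, XMM args get two stack words and a movdbl.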
1566 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1567     for ( int i = first_arg ; i < arg_count ; i++ ) {
1568       if (args[i].first()->is_Register()) {
1569         __ push(args[i].first()->as_Register());
1570       } else if (args[i].first()->is_XMMRegister()) {
1571         __ subptr(rsp, 2*wordSize);
1572         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1573       }
1574     }
1575 }
1576 
1577 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1578     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1579       if (args[i].first()->is_Register()) {
1580         __ pop(args[i].first()->as_Register());
1581       } else if (args[i].first()->is_XMMRegister()) {
1582         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1583         __ addptr(rsp, 2*wordSize);
1584       }
1585     }
1586 }
1587 
1588 static void verify_oop_args(MacroAssembler* masm,
1589                             const methodHandle& method,
1590                             const BasicType* sig_bt,
1591                             const VMRegPair* regs) {
1592   Register temp_reg = rbx;  // not part of any compiled calling seq
1593   if (VerifyOops) {
1594     for (int i = 0; i < method->size_of_parameters(); i++) {
1595       if (is_reference_type(sig_bt[i])) {
1596         VMReg r = regs[i].first();
1597         assert(r->is_valid(), "bad oop arg");
1598         if (r->is_stack()) {
1599           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1600           __ verify_oop(temp_reg);
1601         } else {
1602           __ verify_oop(r->as_Register());
1603         }
1604       }
1605     }
1606   }
1607 }
1608 
1609 static void check_continuation_enter_argument(VMReg actual_vmreg,
1610                                               Register expected_reg,
1611                                               const char* name) {
1612   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1613   assert(actual_vmreg->as_Register() == expected_reg,
1614          "%s is in unexpected register: %s instead of %s",
1615          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1616 }
1617 
1618 
1619 //---------------------------- continuation_enter_setup ---------------------------
1620 //
1621 // Arguments:
1622 //   None.
1623 //
1624 // Results:
1625 //   rsp: pointer to blank ContinuationEntry
1626 //
1627 // Kills:
1628 //   rax
1629 //
1630 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1631   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1632   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1633   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1634 
1635   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1636   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1637 
1638   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1639   OopMap* map = new OopMap(frame_size, 0);
1640 
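       // Link the new (still blank) entry to the previous one and publish it as
       // the thread's current ContinuationEntry.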
1641   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1642   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1643   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1644 
1645   return map;
1646 }
1647 
1648 //---------------------------- fill_continuation_entry ---------------------------
1649 //
1650 // Arguments:
1651 //   rsp: pointer to blank Continuation entry
1652 //   reg_cont_obj: pointer to the continuation
1653 //   reg_flags: flags
1654 //
1655 // Results:
1656 //   rsp: pointer to filled out ContinuationEntry
1657 //
1658 // Kills:
1659 //   rax
1660 //
1661 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1662   assert_different_registers(rax, reg_cont_obj, reg_flags);
1663 #ifdef ASSERT
1664   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1665 #endif
1666   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1667   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1668   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1669   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1670   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1671 
1672   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1673   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1674   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1675   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1676 
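       // The caller's cont_fastpath and held monitor count are now saved in the
       // entry; reset the thread-local copies for the new continuation.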
1677   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1678   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1679 }
1680 
1681 //---------------------------- continuation_enter_cleanup ---------------------------
1682 //
1683 // Arguments:
1684 //   rsp: pointer to the ContinuationEntry
1685 //
1686 // Results:
1687 //   rsp: pointer to the spilled rbp in the entry frame
1688 //
1689 // Kills:
1690 //   rbx
1691 //
1692 static void continuation_enter_cleanup(MacroAssembler* masm) {
1693 #ifdef ASSERT
1694   Label L_good_sp;
1695   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1696   __ jcc(Assembler::equal, L_good_sp);
1697   __ stop("Incorrect rsp at continuation_enter_cleanup");
1698   __ bind(L_good_sp);
1699 #endif
1700   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1701   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1702 
1703   if (CheckJNICalls) {
1704     // Check if this is a virtual thread continuation
1705     Label L_skip_vthread_code;
1706     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1707     __ jcc(Assembler::equal, L_skip_vthread_code);
1708 
1709     // If the held monitor count is > 0 and this vthread is terminating then
1710     // it failed to release a JNI monitor. So we issue the same log message
1711     // that JavaThread::exit does.
1712     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1713     __ jcc(Assembler::equal, L_skip_vthread_code);
1714 
1715     // rax may hold an exception oop, save it before the call
1716     __ push(rax);
1717     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1718     __ pop(rax);
1719 
1720     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1721     // on termination. The held count is implicitly zeroed below when we restore from
1722     // the parent held count (which has to be zero).
1723     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1724 
1725     __ bind(L_skip_vthread_code);
1726   }
1727 #ifdef ASSERT
1728   else {
1729     // Check if this is a virtual thread continuation
1730     Label L_skip_vthread_code;
1731     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1732     __ jcc(Assembler::equal, L_skip_vthread_code);
1733 
1734     // See comment just above. If not checking JNI calls the JNI count is only
1735     // needed for assertion checking.
1736     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1737 
1738     __ bind(L_skip_vthread_code);
1739   }
1740 #endif
1741 
1742   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1743   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1744 
1745   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1746   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1747   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1748 }
1749 
1750 static void gen_continuation_enter(MacroAssembler* masm,
1751                                    const VMRegPair* regs,
1752                                    int& exception_offset,
1753                                    OopMapSet* oop_maps,
1754                                    int& frame_complete,
1755                                    int& stack_slots,
1756                                    int& interpreted_entry_offset,
1757                                    int& compiled_entry_offset) {
1758 
1759   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1760   int pos_cont_obj   = 0;
1761   int pos_is_cont    = 1;
1762   int pos_is_virtual = 2;
1763 
1764   // The platform-specific calling convention may present the arguments in various registers.
1765   // To simplify the rest of the code, we expect the arguments to reside in these known
1766   // registers, and we additionally check the placement here in case the calling convention
1767   // ever changes.
1768   Register reg_cont_obj   = c_rarg1;
1769   Register reg_is_cont    = c_rarg2;
1770   Register reg_is_virtual = c_rarg3;
1771 
1772   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1773   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1774   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1775 
1776   // Utility methods kill rax, make sure there are no collisions
1777   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1778 
1779   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1780                          relocInfo::static_call_type);
1781 
1782   address start = __ pc();
1783 
1784   Label L_thaw, L_exit;
1785 
1786   // i2i entry used at interp_only_mode only
1787   interpreted_entry_offset = __ pc() - start;
1788   {
1789 #ifdef ASSERT
1790     Label is_interp_only;
1791     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1792     __ jcc(Assembler::notEqual, is_interp_only);
1793     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1794     __ bind(is_interp_only);
1795 #endif
1796 
1797     __ pop(rax); // return address
1798     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
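         // The interpreter passes them on the stack, first argument farthest from rsp:
         // slot 2 is the Continuation oop, slot 1 is isContinue, slot 0 is isVirtualThread.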
1799     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1800     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1801     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1802     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1803     __ push(rax); // return address
1804     __ push_cont_fastpath();
1805 
1806     __ enter();
1807 
1808     stack_slots = 2; // will be adjusted in setup
1809     OopMap* map = continuation_enter_setup(masm, stack_slots);
1810     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1811     // That's okay because at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1812 
1813     __ verify_oop(reg_cont_obj);
1814 
1815     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1816 
1817     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1818     __ testptr(reg_is_cont, reg_is_cont);
1819     __ jcc(Assembler::notZero, L_thaw);
1820 
1821     // --- Resolve path
1822 
1823     // Make sure the call is patchable
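         // (align so that the 32-bit displacement of the call does not straddle a
         // machine word boundary and can be patched atomically when the call is resolved)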
1824     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1825     // Emit stub for static call
1826     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1827     if (stub == nullptr) {
1828       fatal("CodeCache is full at gen_continuation_enter");
1829     }
1830     __ call(resolve);
1831     oop_maps->add_gc_map(__ pc() - start, map);
1832     __ post_call_nop();
1833 
1834     __ jmp(L_exit);
1835   }
1836 
1837   // compiled entry
1838   __ align(CodeEntryAlignment);
1839   compiled_entry_offset = __ pc() - start;
1840   __ enter();
1841 
1842   stack_slots = 2; // will be adjusted in setup
1843   OopMap* map = continuation_enter_setup(masm, stack_slots);
1844 
1845   // Frame is now completed as far as size and linkage.
1846   frame_complete = __ pc() - start;
1847 
1848   __ verify_oop(reg_cont_obj);
1849 
1850   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1851 
1852   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1853   __ testptr(reg_is_cont, reg_is_cont);
1854   __ jccb(Assembler::notZero, L_thaw);
1855 
1856   // --- call Continuation.enter(Continuation c, boolean isContinue)
1857 
1858   // Make sure the call is patchable
1859   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1860 
1861   // Emit stub for static call
1862   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1863   if (stub == nullptr) {
1864     fatal("CodeCache is full at gen_continuation_enter");
1865   }
1866 
1867   // The call needs to be resolved. There's a special case for this in
1868   // SharedRuntime::find_callee_info_helper() which calls
1869   // LinkResolver::resolve_continuation_enter() which resolves the call to
1870   // Continuation.enter(Continuation c, boolean isContinue).
1871   __ call(resolve);
1872 
1873   oop_maps->add_gc_map(__ pc() - start, map);
1874   __ post_call_nop();
1875 
1876   __ jmpb(L_exit);
1877 
1878   // --- Thawing path
1879 
1880   __ bind(L_thaw);
1881 
1882   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1883   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1884 
1885   ContinuationEntry::_return_pc_offset = __ pc() - start;
1886   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1887   __ post_call_nop();
1888 
1889   // --- Normal exit (resolve/thawing)
1890 
1891   __ bind(L_exit);
1892   ContinuationEntry::_cleanup_offset = __ pc() - start;
1893   continuation_enter_cleanup(masm);
1894   __ pop(rbp);
1895   __ ret(0);
1896 
1897   // --- Exception handling path
1898 
1899   exception_offset = __ pc() - start;
1900 
1901   continuation_enter_cleanup(masm);
1902   __ pop(rbp);
1903 
1904   __ movptr(c_rarg0, r15_thread);
1905   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1906 
1907   // rax still holds the original exception oop, save it before the call
1908   __ push(rax);
1909 
1910   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1911   __ movptr(rbx, rax);
1912 
1913   // Continue at exception handler:
1914   //   rax: exception oop
1915   //   rbx: exception handler
1916   //   rdx: exception pc
1917   __ pop(rax);
1918   __ verify_oop(rax);
1919   __ pop(rdx);
1920   __ jmp(rbx);
1921 }
1922 
1923 static void gen_continuation_yield(MacroAssembler* masm,
1924                                    const VMRegPair* regs,
1925                                    OopMapSet* oop_maps,
1926                                    int& frame_complete,
1927                                    int& stack_slots,
1928                                    int& compiled_entry_offset) {
1929   enum layout {
1930     rbp_off,
1931     rbpH_off,
1932     return_off,
1933     return_off2,
1934     framesize // inclusive of return address
1935   };
1936   stack_slots = framesize / VMRegImpl::slots_per_word;
1937   assert(stack_slots == 2, "recheck layout");
1938 
1939   address start = __ pc();
1940   compiled_entry_offset = __ pc() - start;
1941   __ enter();
1942   address the_pc = __ pc();
1943 
1944   frame_complete = the_pc - start;
1945 
1946   // This nop must be exactly at the PC we push into the frame info.
1947   // We use this nop for fast CodeBlob lookup, associate the OopMap
1948   // with it right away.
1949   __ post_call_nop();
1950   OopMap* map = new OopMap(framesize, 1);
1951   oop_maps->add_gc_map(frame_complete, map);
1952 
1953   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1954   __ movptr(c_rarg0, r15_thread);
1955   __ movptr(c_rarg1, rsp);
1956   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1957   __ reset_last_Java_frame(true);
1958 
1959   Label L_pinned;
1960 
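       // freeze_entry() returns 0 on success; any other result means the freeze
       // did not happen (e.g. the continuation is pinned) and we return to the caller.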
1961   __ testptr(rax, rax);
1962   __ jcc(Assembler::notZero, L_pinned);
1963 
1964   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1965   continuation_enter_cleanup(masm);
1966   __ pop(rbp);
1967   __ ret(0);
1968 
1969   __ bind(L_pinned);
1970 
1971   // Pinned, return to caller
1972 
1973   // handle pending exception thrown by freeze
1974   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1975   Label ok;
1976   __ jcc(Assembler::equal, ok);
1977   __ leave();
1978   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1979   __ bind(ok);
1980 
1981   __ leave();
1982   __ ret(0);
1983 }
1984 
1985 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1986   ::continuation_enter_cleanup(masm);
1987 }
1988 
1989 static void gen_special_dispatch(MacroAssembler* masm,
1990                                  const methodHandle& method,
1991                                  const BasicType* sig_bt,
1992                                  const VMRegPair* regs) {
1993   verify_oop_args(masm, method, sig_bt, regs);
1994   vmIntrinsics::ID iid = method->intrinsic_id();
1995 
1996   // Now write the args into the outgoing interpreter space
1997   bool     has_receiver   = false;
1998   Register receiver_reg   = noreg;
1999   int      member_arg_pos = -1;
2000   Register member_reg     = noreg;
2001   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
2002   if (ref_kind != 0) {
2003     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
2004     member_reg = rbx;  // known to be free at this point
2005     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
2006   } else if (iid == vmIntrinsics::_invokeBasic) {
2007     has_receiver = true;
2008   } else if (iid == vmIntrinsics::_linkToNative) {
2009     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
2010     member_reg = rbx;  // known to be free at this point
2011   } else {
2012     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
2013   }
2014 
2015   if (member_reg != noreg) {
2016     // Load the member_arg into register, if necessary.
2017     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
2018     VMReg r = regs[member_arg_pos].first();
2019     if (r->is_stack()) {
2020       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
2021     } else {
2022       // no data motion is needed
2023       member_reg = r->as_Register();
2024     }
2025   }
2026 
2027   if (has_receiver) {
2028     // Make sure the receiver is loaded into a register.
2029     assert(method->size_of_parameters() > 0, "oob");
2030     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
2031     VMReg r = regs[0].first();
2032     assert(r->is_valid(), "bad receiver arg");
2033     if (r->is_stack()) {
2034       // Porting note:  This assumes that compiled calling conventions always
2035       // pass the receiver oop in a register.  If this is not true on some
2036       // platform, pick a temp and load the receiver from stack.
2037       fatal("receiver always in a register");
2038       receiver_reg = j_rarg0;  // known to be free at this point
2039       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
2040     } else {
2041       // no data motion is needed
2042       receiver_reg = r->as_Register();
2043     }
2044   }
2045 
2046   // Figure out which address we are really jumping to:
2047   MethodHandles::generate_method_handle_dispatch(masm, iid,
2048                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
2049 }
2050 
2051 // ---------------------------------------------------------------------------
2052 // Generate a native wrapper for a given method.  The method takes arguments
2053 // in the Java compiled code convention, marshals them to the native
2054 // convention (handlizes oops, etc), transitions to native, makes the call,
2055 // returns to java state (possibly blocking), unhandlizes any result and
2056 // returns.
2057 //
2058 // Critical native functions are a shorthand for the use of
2059 // GetPrimitiveArrayCritical and disallow the use of any other JNI
2060 // functions.  The wrapper is expected to unpack the arguments before
2061 // passing them to the callee. Critical native functions leave the state _in_Java,
2062 // since they cannot stop for GC.
2063 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
2064 // block and the check for pending exceptions, since it's impossible for them
2065 // to be thrown.
2066 //
2067 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
2068                                                 const methodHandle& method,
2069                                                 int compile_id,
2070                                                 BasicType* in_sig_bt,
2071                                                 VMRegPair* in_regs,
2072                                                 BasicType ret_type) {
2073   if (method->is_continuation_native_intrinsic()) {
2074     int exception_offset = -1;
2075     OopMapSet* oop_maps = new OopMapSet();
2076     int frame_complete = -1;
2077     int stack_slots = -1;
2078     int interpreted_entry_offset = -1;
2079     int vep_offset = -1;
2080     if (method->is_continuation_enter_intrinsic()) {
2081       gen_continuation_enter(masm,
2082                              in_regs,
2083                              exception_offset,
2084                              oop_maps,
2085                              frame_complete,
2086                              stack_slots,
2087                              interpreted_entry_offset,
2088                              vep_offset);
2089     } else if (method->is_continuation_yield_intrinsic()) {
2090       gen_continuation_yield(masm,
2091                              in_regs,
2092                              oop_maps,
2093                              frame_complete,
2094                              stack_slots,
2095                              vep_offset);
2096     } else {
2097       guarantee(false, "Unknown Continuation native intrinsic");
2098     }
2099 
2100 #ifdef ASSERT
2101     if (method->is_continuation_enter_intrinsic()) {
2102       assert(interpreted_entry_offset != -1, "Must be set");
2103       assert(exception_offset != -1,         "Must be set");
2104     } else {
2105       assert(interpreted_entry_offset == -1, "Must be unset");
2106       assert(exception_offset == -1,         "Must be unset");
2107     }
2108     assert(frame_complete != -1,    "Must be set");
2109     assert(stack_slots != -1,       "Must be set");
2110     assert(vep_offset != -1,        "Must be set");
2111 #endif
2112 
2113     __ flush();
2114     nmethod* nm = nmethod::new_native_nmethod(method,
2115                                               compile_id,
2116                                               masm->code(),
2117                                               vep_offset,
2118                                               frame_complete,
2119                                               stack_slots,
2120                                               in_ByteSize(-1),
2121                                               in_ByteSize(-1),
2122                                               oop_maps,
2123                                               exception_offset);
2124     if (nm == nullptr) return nm;
2125     if (method->is_continuation_enter_intrinsic()) {
2126       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2127     } else if (method->is_continuation_yield_intrinsic()) {
2128       _cont_doYield_stub = nm;
2129     }
2130     return nm;
2131   }
2132 
2133   if (method->is_method_handle_intrinsic()) {
2134     vmIntrinsics::ID iid = method->intrinsic_id();
2135     intptr_t start = (intptr_t)__ pc();
2136     int vep_offset = ((intptr_t)__ pc()) - start;
2137     gen_special_dispatch(masm,
2138                          method,
2139                          in_sig_bt,
2140                          in_regs);
2141     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2142     __ flush();
2143     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2144     return nmethod::new_native_nmethod(method,
2145                                        compile_id,
2146                                        masm->code(),
2147                                        vep_offset,
2148                                        frame_complete,
2149                                        stack_slots / VMRegImpl::slots_per_word,
2150                                        in_ByteSize(-1),
2151                                        in_ByteSize(-1),
2152                                        nullptr);
2153   }
2154   address native_func = method->native_function();
2155   assert(native_func != nullptr, "must have function");
2156 
2157   // An OopMap for lock (and class if static)
2158   OopMapSet *oop_maps = new OopMapSet();
2159   intptr_t start = (intptr_t)__ pc();
2160 
2161   // We have received a description of where all the java args are located
2162   // on entry to the wrapper. We need to convert these args to where
2163   // the jni function will expect them. To figure out where they go
2164   // we convert the java signature to a C signature by inserting
2165   // the hidden arguments as arg[0] and possibly arg[1] (static method)
2166 
2167   const int total_in_args = method->size_of_parameters();
2168   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2169 
2170   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2171   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2172 
2173   int argc = 0;
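       // Hidden leading C arguments: the JNIEnv* and, for static methods, the
       // class mirror.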
2174   out_sig_bt[argc++] = T_ADDRESS;
2175   if (method->is_static()) {
2176     out_sig_bt[argc++] = T_OBJECT;
2177   }
2178 
2179   for (int i = 0; i < total_in_args ; i++ ) {
2180     out_sig_bt[argc++] = in_sig_bt[i];
2181   }
2182 
2183   // Now figure out where the args must be stored and how much stack space
2184   // they require.
2185   int out_arg_slots;
2186   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2187 
2188   // Compute framesize for the wrapper.  We need to handlize all oops in
2189   // incoming registers
2190 
2191   // Calculate the total number of stack slots we will need.
2192 
2193   // First count the abi requirement plus all of the outgoing args
2194   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2195 
2196   // Now the space for the inbound oop handle area
2197   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
2198 
2199   int oop_handle_offset = stack_slots;
2200   stack_slots += total_save_slots;
2201 
2202   // Now any space we need for handlizing a klass if static method
2203 
2204   int klass_slot_offset = 0;
2205   int klass_offset = -1;
2206   int lock_slot_offset = 0;
2207   bool is_static = false;
2208 
2209   if (method->is_static()) {
2210     klass_slot_offset = stack_slots;
2211     stack_slots += VMRegImpl::slots_per_word;
2212     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2213     is_static = true;
2214   }
2215 
2216   // Plus a lock if needed
2217 
2218   if (method->is_synchronized()) {
2219     lock_slot_offset = stack_slots;
2220     stack_slots += VMRegImpl::slots_per_word;
2221   }
2222 
2223   // Now a place (+2) to save return values or temp during shuffling
2224   // + 4 for return address (which we own) and saved rbp
2225   stack_slots += 6;
2226 
2227   // Ok The space we have allocated will look like:
2228   //
2229   //
2230   // FP-> |                     |
2231   //      |---------------------|
2232   //      | 2 slots for moves   |
2233   //      |---------------------|
2234   //      | lock box (if sync)  |
2235   //      |---------------------| <- lock_slot_offset
2236   //      | klass (if static)   |
2237   //      |---------------------| <- klass_slot_offset
2238   //      | oopHandle area      |
2239   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2240   //      | outbound memory     |
2241   //      | based arguments     |
2242   //      |                     |
2243   //      |---------------------|
2244   //      |                     |
2245   // SP-> | out_preserved_slots |
2246   //
2247   //
2248 
2249 
2250   // Now compute the actual number of stack words we need, rounding to keep the
2251   // stack properly aligned.
2252   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2253 
2254   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2255 
2256   // First thing make an ic check to see if we should even be here
2257 
2258   // We are free to use all registers as temps without saving them and
2259   // restoring them except rbp. rbp is the only callee save register
2260   // as far as the interpreter and the compiler(s) are concerned.
2261 
2262   const Register receiver = j_rarg0;
2263 
2264   Label exception_pending;
2265 
2266   assert_different_registers(receiver, rscratch1, rscratch2);
2267   __ verify_oop(receiver);
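       // ic_check jumps to the ic-miss stub on a receiver klass mismatch; on a hit
       // we fall through to the verified entry point below.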
2268   __ ic_check(8 /* end_alignment */);
2269 
2270   int vep_offset = ((intptr_t)__ pc()) - start;
2271 
2272   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2273     Label L_skip_barrier;
2274     Register klass = r10;
2275     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2276     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
2277 
2278     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2279 
2280     __ bind(L_skip_barrier);
2281   }
2282 
2283 #ifdef COMPILER1
2284   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2285   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2286     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2287   }
2288 #endif // COMPILER1
2289 
2290   // The instruction at the verified entry point must be 5 bytes or longer
2291   // because it can be patched on the fly by make_non_entrant. The stack bang
2292   // instruction fits that requirement.
2293 
2294   // Generate stack overflow check
2295   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2296 
2297   // Generate a new frame for the wrapper.
2298   __ enter();
2299   // -2 because return address is already present and so is saved rbp
2300   __ subptr(rsp, stack_size - 2*wordSize);
2301 
2302   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2303   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2304   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2305 
2306   // Frame is now completed as far as size and linkage.
2307   int frame_complete = ((intptr_t)__ pc()) - start;
2308 
2309 #ifdef ASSERT
2310   __ check_stack_alignment(rsp, "improperly aligned stack");
2311 #endif /* ASSERT */
2312 
2313 
2314   // We use r14 as the oop handle for the receiver/klass
2315   // It is callee save so it survives the call to native
2316 
2317   const Register oop_handle_reg = r14;
2318 
2319   //
2320   // We immediately shuffle the arguments so that for any vm call we have to
2321   // make from here on out (sync slow path, jvmti, etc.) we will have
2322   // captured the oops from our caller and have a valid oopMap for
2323   // them.
2324 
2325   // -----------------
2326   // The Grand Shuffle
2327 
2328   // The Java calling convention is either equal (linux) or denser (win64) than the
2329   // c calling convention. However, because of the jni_env argument the c calling
2330   // convention always has at least one more (and two for static) arguments than Java.
2331   // Therefore if we move the args from java -> c backwards then we will never have
2332   // a register->register conflict and we don't have to build a dependency graph
2333   // and figure out how to break any cycles.
2334   //
2335 
2336   // Record esp-based slot for receiver on stack for non-static methods
2337   int receiver_offset = -1;
2338 
2339   // This is a trick. We double the stack slots so we can claim
2340   // the oops in the caller's frame. Since we are sure to have
2341   // more args than the caller, doubling is enough to make
2342   // sure we can capture all the incoming oop args from the
2343   // caller.
2344   //
2345   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2346 
2347   // Mark location of rbp (someday)
2348   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2349 
2350   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2351   // All inbound args are referenced based on rbp and all outbound args via rsp.
2352 
2353 
2354 #ifdef ASSERT
2355   bool reg_destroyed[Register::number_of_registers];
2356   bool freg_destroyed[XMMRegister::number_of_registers];
2357   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2358     reg_destroyed[r] = false;
2359   }
2360   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2361     freg_destroyed[f] = false;
2362   }
2363 
2364 #endif /* ASSERT */
2365 
2366   // For JNI natives the incoming and outgoing registers are offset upwards.
2367   GrowableArray<int> arg_order(2 * total_in_args);
2368 
2369   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2370     arg_order.push(i);
2371     arg_order.push(c_arg);
2372   }
2373 
2374   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2375     int i = arg_order.at(ai);
2376     int c_arg = arg_order.at(ai + 1);
2377     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2378 #ifdef ASSERT
2379     if (in_regs[i].first()->is_Register()) {
2380       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2381     } else if (in_regs[i].first()->is_XMMRegister()) {
2382       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2383     }
2384     if (out_regs[c_arg].first()->is_Register()) {
2385       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2386     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2387       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2388     }
2389 #endif /* ASSERT */
2390     switch (in_sig_bt[i]) {
2391       case T_ARRAY:
2392       case T_OBJECT:
2393         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2394                     ((i == 0) && (!is_static)),
2395                     &receiver_offset);
2396         break;
2397       case T_VOID:
2398         break;
2399 
2400       case T_FLOAT:
2401         __ float_move(in_regs[i], out_regs[c_arg]);
2402           break;
2403 
2404       case T_DOUBLE:
2405         assert( i + 1 < total_in_args &&
2406                 in_sig_bt[i + 1] == T_VOID &&
2407                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2408         __ double_move(in_regs[i], out_regs[c_arg]);
2409         break;
2410 
2411       case T_LONG :
2412         __ long_move(in_regs[i], out_regs[c_arg]);
2413         break;
2414 
2415       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2416 
2417       default:
2418         __ move32_64(in_regs[i], out_regs[c_arg]);
2419     }
2420   }
2421 
2422   int c_arg;
2423 
2424   // Pre-load a static method's oop into r14.  Used both by locking code and
2425   // the normal JNI call code.
2426   // point c_arg at the first arg that is already loaded in case we
2427   // need to spill before we call out
2428   c_arg = total_c_args - total_in_args;
2429 
2430   if (method->is_static()) {
2431 
2432     //  load oop into a register
2433     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2434 
2435     // Now handlize the static class mirror; it's known not-null.
2436     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2437     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2438 
2439     // Now get the handle
2440     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2441     // store the klass handle as second argument
2442     __ movptr(c_rarg1, oop_handle_reg);
2443     // and protect the arg if we must spill
2444     c_arg--;
2445   }
2446 
2447   // Change state to native (we save the return address in the thread, since it might not
2448   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2449   // points into the right code segment. It does not have to be the correct return pc.
2450   // We use the same pc/oopMap repeatedly when we call out
2451 
2452   Label native_return;
2453   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2454     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2455     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2456   } else {
2457     intptr_t the_pc = (intptr_t) __ pc();
2458     oop_maps->add_gc_map(the_pc - start, map);
2459 
2460     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2461   }
2462 
2463   // We have all of the arguments set up at this point. We must not touch any register
2464   // argument registers at this point (if we had to save/restore them, there would be no oop map describing them).
2465 
2466   if (DTraceMethodProbes) {
2467     // protect the args we've loaded
2468     save_args(masm, total_c_args, c_arg, out_regs);
2469     __ mov_metadata(c_rarg1, method());
2470     __ call_VM_leaf(
2471       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2472       r15_thread, c_rarg1);
2473     restore_args(masm, total_c_args, c_arg, out_regs);
2474   }
2475 
2476   // RedefineClasses() tracing support for obsolete method entry
2477   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2478     // protect the args we've loaded
2479     save_args(masm, total_c_args, c_arg, out_regs);
2480     __ mov_metadata(c_rarg1, method());
2481     __ call_VM_leaf(
2482       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2483       r15_thread, c_rarg1);
2484     restore_args(masm, total_c_args, c_arg, out_regs);
2485   }
2486 
2487   // Lock a synchronized method
2488 
2489   // Register definitions used by locking and unlocking
2490 
2491   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2492   const Register obj_reg  = rbx;  // Will contain the oop
2493   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2494   const Register old_hdr  = r13;  // value of old header at unlock time
2495 
2496   Label slow_path_lock;
2497   Label lock_done;
2498 
2499   if (method->is_synchronized()) {
2500     Label count_mon;
2501 
2502     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2503 
2504     // Get the handle (the 2nd argument)
2505     __ mov(oop_handle_reg, c_rarg1);
2506 
2507     // Get address of the box
2508 
2509     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2510 
2511     // Load the oop from the handle
2512     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2513 
2514     if (LockingMode == LM_MONITOR) {
2515       __ jmp(slow_path_lock);
2516     } else if (LockingMode == LM_LEGACY) {
2517       // Load immediate 1 into swap_reg %rax
2518       __ movl(swap_reg, 1);
2519 
2520       // Load (object->mark() | 1) into swap_reg %rax
2521       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2522       if (EnableValhalla) {
2523         // Mask inline_type bit such that we go to the slow path if object is an inline type
2524         __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place));
2525       }
2526 
2527       // Save (object->mark() | 1) into BasicLock's displaced header
2528       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2529 
2530       // src -> dest iff dest == rax else rax <- dest
2531       __ lock();
2532       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2533       __ jcc(Assembler::equal, count_mon);
2534 
2535       // Hmm should this move to the slow path code area???
2536 
2537       // Test if the oopMark is an obvious stack pointer, i.e.,
2538       //  1) (mark & 3) == 0, and
2539       //  2) rsp <= mark < mark + os::pagesize()
2540       // These 3 tests can be done by evaluating the following
2541       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2542       // assuming both stack pointer and pagesize have their
2543       // least significant 2 bits clear.
2544       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2545 
2546       __ subptr(swap_reg, rsp);
2547       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2548 
2549       // Save the test result, for recursive case, the result is zero
2550       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2551       __ jcc(Assembler::notEqual, slow_path_lock);
2552 
2553       __ bind(count_mon);
2554       __ inc_held_monitor_count();
2555     } else {
2556       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2557       __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2558     }
2559 
2560     // Slow path will re-enter here
2561     __ bind(lock_done);
2562   }
2563 
2564   // Finally just about ready to make the JNI call
2565 
2566   // get JNIEnv* which is first argument to native
2567   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2568 
2569   // Now set thread in native
2570   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2571 
2572   __ call(RuntimeAddress(native_func));
2573 
2574   // Verify or restore cpu control state after JNI call
2575   __ restore_cpu_control_state_after_jni(rscratch1);
2576 
2577   // Unpack native results.
2578   switch (ret_type) {
2579   case T_BOOLEAN: __ c2bool(rax);            break;
2580   case T_CHAR   : __ movzwl(rax, rax);      break;
2581   case T_BYTE   : __ sign_extend_byte (rax); break;
2582   case T_SHORT  : __ sign_extend_short(rax); break;
2583   case T_INT    : /* nothing to do */        break;
2584   case T_DOUBLE :
2585   case T_FLOAT  :
2586     // Result is in xmm0 we'll save as needed
2587     break;
2588   case T_ARRAY:                 // Really a handle
2589   case T_OBJECT:                // Really a handle
2590       break; // can't de-handlize until after safepoint check
2591   case T_VOID: break;
2592   case T_LONG: break;
2593   default       : ShouldNotReachHere();
2594   }
2595 
2596   // Switch thread to "native transition" state before reading the synchronization state.
2597   // This additional state is necessary because reading and testing the synchronization
2598   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2599   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2600   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2601   //     Thread A is resumed to finish this native method, but doesn't block here since it
2602   //     didn't see any synchronization in progress, and escapes.
2603   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2604 
2605   // Force this write out before the read below
2606   if (!UseSystemMemoryBarrier) {
2607     __ membar(Assembler::Membar_mask_bits(
2608               Assembler::LoadLoad | Assembler::LoadStore |
2609               Assembler::StoreLoad | Assembler::StoreStore));
2610   }
2611 
2612   // check for safepoint operation in progress and/or pending suspend requests
2613   {
2614     Label Continue;
2615     Label slow_path;
2616 
2617     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2618 
2619     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2620     __ jcc(Assembler::equal, Continue);
2621     __ bind(slow_path);
2622 
2623     // Don't use call_VM as it will see a possible pending exception, forward it,
2624     // and never return here, preventing us from clearing _last_native_pc down below.
2625     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2626     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2627     // by hand.
2628     //
2629     __ vzeroupper();
2630     save_native_result(masm, ret_type, stack_slots);
2631     __ mov(c_rarg0, r15_thread);
2632     __ mov(r12, rsp); // remember sp
2633     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2634     __ andptr(rsp, -16); // align stack as required by ABI
2635     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2636     __ mov(rsp, r12); // restore sp
2637     __ reinit_heapbase();
2638     // Restore any method result value
2639     restore_native_result(masm, ret_type, stack_slots);
2640     __ bind(Continue);
2641   }
2642 
2643   // change thread state
2644   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2645 
2646   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2647     // Check preemption for Object.wait()
2648     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2649     __ cmpptr(rscratch1, NULL_WORD);
2650     __ jccb(Assembler::equal, native_return);
2651     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2652     __ jmp(rscratch1);
2653     __ bind(native_return);
2654 
2655     intptr_t the_pc = (intptr_t) __ pc();
2656     oop_maps->add_gc_map(the_pc - start, map);
2657   }
2658 
2659 
2660   Label reguard;
2661   Label reguard_done;
2662   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2663   __ jcc(Assembler::equal, reguard);
2664   __ bind(reguard_done);
2665 
2666   // native result if any is live
2667 
2668   // Unlock
2669   Label slow_path_unlock;
2670   Label unlock_done;
2671   if (method->is_synchronized()) {
2672 
2673     Label fast_done;
2674 
2675     // Get locked oop from the handle we passed to jni
2676     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2677 
2678     if (LockingMode == LM_LEGACY) {
2679       Label not_recur;
2680       // Simple recursive lock?
2681       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2682       __ jcc(Assembler::notEqual, not_recur);
2683       __ dec_held_monitor_count();
2684       __ jmpb(fast_done);
2685       __ bind(not_recur);
2686     }
2687 
2688     // Must save rax if it is live now because cmpxchg must use it
2689     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2690       save_native_result(masm, ret_type, stack_slots);
2691     }
2692 
2693     if (LockingMode == LM_MONITOR) {
2694       __ jmp(slow_path_unlock);
2695     } else if (LockingMode == LM_LEGACY) {
2696       // get address of the stack lock
2697       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2698       //  get old displaced header
2699       __ movptr(old_hdr, Address(rax, 0));
2700 
2701       // Atomic swap old header if oop still contains the stack lock
2702       __ lock();
2703       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2704       __ jcc(Assembler::notEqual, slow_path_unlock);
2705       __ dec_held_monitor_count();
2706     } else {
2707       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2708       __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2709     }
2710 
2711     // slow path re-enters here
2712     __ bind(unlock_done);
2713     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2714       restore_native_result(masm, ret_type, stack_slots);
2715     }
2716 
2717     __ bind(fast_done);
2718   }
2719   if (DTraceMethodProbes) {
2720     save_native_result(masm, ret_type, stack_slots);
2721     __ mov_metadata(c_rarg1, method());
2722     __ call_VM_leaf(
2723          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2724          r15_thread, c_rarg1);
2725     restore_native_result(masm, ret_type, stack_slots);
2726   }
2727 
2728   __ reset_last_Java_frame(false);
2729 
2730   // Unbox oop result, e.g. JNIHandles::resolve value.
2731   if (is_reference_type(ret_type)) {
2732     __ resolve_jobject(rax /* value */,
2733                        rcx /* tmp */);
2734   }
2735 
2736   if (CheckJNICalls) {
2737     // clear_pending_jni_exception_check
2738     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2739   }
2740 
2741   // reset handle block
2742   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2743   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2744 
2745   // pop our frame
2746 
2747   __ leave();
2748 
2749   // Any exception pending?
2750   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2751   __ jcc(Assembler::notEqual, exception_pending);
2752 
2753   // Return
2754 
2755   __ ret(0);
2756 
2757   // Unexpected paths are out of line and go here
2758 
2759   // forward the exception
2760   __ bind(exception_pending);
2761 
2762   // and forward the exception
2763   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2764 
2765   // Slow path locking & unlocking
2766   if (method->is_synchronized()) {
2767 
2768     // BEGIN Slow path lock
2769     __ bind(slow_path_lock);
2770 
2771     // We have last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM.
2772     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2773 
2774     // protect the args we've loaded
2775     save_args(masm, total_c_args, c_arg, out_regs);
2776 
2777     __ mov(c_rarg0, obj_reg);
2778     __ mov(c_rarg1, lock_reg);
2779     __ mov(c_rarg2, r15_thread);
2780 
2781     // Not a leaf but we have last_Java_frame setup as we want.
2782     // We don't want to unmount in case of contention since that would complicate preserving
2783     // the arguments that had already been marshalled into the native convention. So we force
2784     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2785     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2786     __ push_cont_fastpath();
2787     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2788     __ pop_cont_fastpath();
2789     restore_args(masm, total_c_args, c_arg, out_regs);
2790 
2791 #ifdef ASSERT
2792     { Label L;
2793     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2794     __ jcc(Assembler::equal, L);
2795     __ stop("no pending exception allowed on exit from monitorenter");
2796     __ bind(L);
2797     }
2798 #endif
2799     __ jmp(lock_done);
2800 
2801     // END Slow path lock
2802 
2803     // BEGIN Slow path unlock
2804     __ bind(slow_path_unlock);
2805 
2806     // If we haven't already saved the native result we must save it now as xmm registers
2807     // are still exposed.
2808     __ vzeroupper();
2809     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2810       save_native_result(masm, ret_type, stack_slots);
2811     }
2812 
2813     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2814 
2815     __ mov(c_rarg0, obj_reg);
2816     __ mov(c_rarg2, r15_thread);
2817     __ mov(r12, rsp); // remember sp
2818     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2819     __ andptr(rsp, -16); // align stack as required by ABI
2820 
2821     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2822     // NOTE that obj_reg == rbx currently
2823     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2824     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2825 
2826     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2827     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2828     __ mov(rsp, r12); // restore sp
2829     __ reinit_heapbase();
2830 #ifdef ASSERT
2831     {
2832       Label L;
2833       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2834       __ jcc(Assembler::equal, L);
2835       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2836       __ bind(L);
2837     }
2838 #endif /* ASSERT */
2839 
2840     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2841 
2842     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2843       restore_native_result(masm, ret_type, stack_slots);
2844     }
2845     __ jmp(unlock_done);
2846 
2847     // END Slow path unlock
2848 
2849   } // synchronized
2850 
2851   // SLOW PATH Reguard the stack if needed
2852 
2853   __ bind(reguard);
2854   __ vzeroupper();
2855   save_native_result(masm, ret_type, stack_slots);
2856   __ mov(r12, rsp); // remember sp
2857   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2858   __ andptr(rsp, -16); // align stack as required by ABI
2859   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2860   __ mov(rsp, r12); // restore sp
2861   __ reinit_heapbase();
2862   restore_native_result(masm, ret_type, stack_slots);
2863   // and continue
2864   __ jmp(reguard_done);
2865 
2866 
2867 
2868   __ flush();
2869 
2870   nmethod *nm = nmethod::new_native_nmethod(method,
2871                                             compile_id,
2872                                             masm->code(),
2873                                             vep_offset,
2874                                             frame_complete,
2875                                             stack_slots / VMRegImpl::slots_per_word,
2876                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2877                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2878                                             oop_maps);
2879 
2880   return nm;
2881 }
2882 
2883 // This function returns the adjustment size (in number of words) to a c2i adapter
2884 // activation, for use during deoptimization.
2885 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2886   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2887 }
2888 
2889 
2890 uint SharedRuntime::out_preserve_stack_slots() {
2891   return 0;
2892 }
2893 
2894 
2895 // Number of stack slots between incoming argument block and the start of
2896 // a new frame.  The PROLOG must add this many slots to the stack.  The
2897 // EPILOG must remove this many slots.  amd64 needs two slots for the
2898 // return address and two for the saved rbp.
2899 uint SharedRuntime::in_preserve_stack_slots() {
2900   return 4 + 2 * VerifyStackAtCalls;
2901 }
2902 
2903 VMReg SharedRuntime::thread_register() {
2904   return r15_thread->as_VMReg();
2905 }
2906 
2907 //------------------------------generate_deopt_blob----------------------------
2908 void SharedRuntime::generate_deopt_blob() {
2909   // Allocate space for the code
2910   ResourceMark rm;
2911   // Setup code generation tools
2912   int pad = 0;
2913   if (UseAVX > 2) {
2914     pad += 1024;
2915   }
2916   if (UseAPX) {
2917     pad += 1024;
2918   }
2919 #if INCLUDE_JVMCI
2920   if (EnableJVMCI) {
2921     pad += 512; // Increase the buffer size when compiling for JVMCI
2922   }
2923 #endif
2924   const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id);
2925   CodeBuffer buffer(name, 2560+pad, 1024);
2926   MacroAssembler* masm = new MacroAssembler(&buffer);
2927   int frame_size_in_words;
2928   OopMap* map = nullptr;
2929   OopMapSet *oop_maps = new OopMapSet();
2930 
2931   // -------------
2932   // This code enters when returning to a de-optimized nmethod.  A return
2933   // address has been pushed on the stack, and return values are in
2934   // registers.
2935   // If we are doing a normal deopt then we were called from the patched
2936   // nmethod from the point we returned to the nmethod. So the return
2937   // address on the stack is wrong by NativeCall::instruction_size.
2938   // We will adjust the value so it looks like we have the original return
2939   // address on the stack (like when we eagerly deoptimized).
2940   // In the case of an exception pending when deoptimizing, we enter
2941   // with a return address on the stack that points after the call we patched
2942   // into the exception handler. We have the following register state from,
2943   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2944   //    rax: exception oop
2945   //    rbx: exception handler
2946   //    rdx: throwing pc
2947   // So in this case we simply jam rdx into the useless return address and
2948   // the stack looks just like we want.
2949   //
2950   // At this point we need to de-opt.  We save the argument return
2951   // registers.  We call the first C routine, fetch_unroll_info().  This
2952   // routine captures the return values and returns a structure which
2953   // describes the current frame size and the sizes of all replacement frames.
2954   // The current frame is compiled code and may contain many inlined
2955   // functions, each with their own JVM state.  We pop the current frame, then
2956   // push all the new frames.  Then we call the C routine unpack_frames() to
2957   // populate these frames.  Finally unpack_frames() returns us the new target
2958   // address.  Notice that callee-save registers are BLOWN here; they have
2959   // already been captured in the vframeArray at the time the return PC was
2960   // patched.
2961   address start = __ pc();
2962   Label cont;
2963 
2964   // Prolog for non exception case!
2965 
2966   // Save everything in sight.
2967   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2968 
2969   // Normal deoptimization.  Save exec mode for unpack_frames.
2970   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2971   __ jmp(cont);
2972 
2973   int reexecute_offset = __ pc() - start;
2974 #if INCLUDE_JVMCI && !defined(COMPILER1)
2975   if (UseJVMCICompiler) {
2976     // JVMCI does not use this kind of deoptimization
2977     __ should_not_reach_here();
2978   }
2979 #endif
2980 
2981   // Reexecute case
2982   // The return address is the pc that describes which bci to re-execute at
2983 
2984   // No need to update map as each call to save_live_registers will produce identical oopmap
2985   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2986 
2987   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2988   __ jmp(cont);
2989 
2990 #if INCLUDE_JVMCI
2991   Label after_fetch_unroll_info_call;
2992   int implicit_exception_uncommon_trap_offset = 0;
2993   int uncommon_trap_offset = 0;
2994 
2995   if (EnableJVMCI) {
2996     implicit_exception_uncommon_trap_offset = __ pc() - start;
2997 
2998     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2999     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
3000 
3001     uncommon_trap_offset = __ pc() - start;
3002 
3003     // Save everything in sight.
3004     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
3005     // fetch_unroll_info needs to call last_java_frame()
3006     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3007 
3008     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
3009     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
3010 
3011     __ movl(r14, Deoptimization::Unpack_reexecute);
3012     __ mov(c_rarg0, r15_thread);
3013     __ movl(c_rarg2, r14); // exec mode
3014     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3015     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
3016 
3017     __ reset_last_Java_frame(false);
3018 
3019     __ jmp(after_fetch_unroll_info_call);
3020   } // EnableJVMCI
3021 #endif // INCLUDE_JVMCI
3022 
3023   int exception_offset = __ pc() - start;
3024 
3025   // Prolog for exception case
3026 
3027   // all registers are dead at this entry point, except for rax, and
3028   // rdx which contain the exception oop and exception pc
3029   // respectively.  Set them in TLS and fall thru to the
3030   // unpack_with_exception_in_tls entry point.
3031 
3032   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3033   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3034 
3035   int exception_in_tls_offset = __ pc() - start;
3036 
3037   // new implementation because exception oop is now passed in JavaThread
3038 
3039   // Prolog for exception case
3040   // All registers must be preserved because they might be used by LinearScan
3041   // Exception oop and throwing PC are passed in JavaThread
3042   // tos: stack at point of call to method that threw the exception (i.e. only
3043   // args are on the stack, no return address)
3044 
3045   // make room on stack for the return address
3046   // It will be patched later with the throwing pc. The correct value is not
3047   // available now because loading it from memory would destroy registers.
3048   __ push(0);
3049 
3050   // Save everything in sight.
3051   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
3052 
3053   // Now it is safe to overwrite any register
3054 
3055   // Deopt during an exception.  Save exec mode for unpack_frames.
3056   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
3057 
3058   // load throwing pc from JavaThread and patch it as the return address
3059   // of the current frame. Then clear the field in JavaThread
3060 
3061   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3062   __ movptr(Address(rbp, wordSize), rdx);
3063   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3064 
3065 #ifdef ASSERT
3066   // verify that there is really an exception oop in JavaThread
3067   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3068   __ verify_oop(rax);
3069 
3070   // verify that there is no pending exception
3071   Label no_pending_exception;
3072   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3073   __ testptr(rax, rax);
3074   __ jcc(Assembler::zero, no_pending_exception);
3075   __ stop("must not have pending exception here");
3076   __ bind(no_pending_exception);
3077 #endif
3078 
3079   __ bind(cont);
3080 
3081   // Call C code.  Need thread and this frame, but NOT official VM entry
3082   // crud.  We cannot block on this call, no GC can happen.
3083   //
3084   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
3085 
3086   // fetch_unroll_info needs to call last_java_frame().
3087 
3088   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3089 #ifdef ASSERT
3090   { Label L;
3091     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3092     __ jcc(Assembler::equal, L);
3093     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
3094     __ bind(L);
3095   }
3096 #endif // ASSERT
3097   __ mov(c_rarg0, r15_thread);
3098   __ movl(c_rarg1, r14); // exec_mode
3099   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
3100 
3101   // Need to have an oopmap that tells fetch_unroll_info where to
3102   // find any register it might need.
3103   oop_maps->add_gc_map(__ pc() - start, map);
3104 
3105   __ reset_last_Java_frame(false);
3106 
3107 #if INCLUDE_JVMCI
3108   if (EnableJVMCI) {
3109     __ bind(after_fetch_unroll_info_call);
3110   }
3111 #endif
3112 
3113   // Load UnrollBlock* into rdi
3114   __ mov(rdi, rax);
3115 
3116   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
3117   Label noException;
3118   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
3119   __ jcc(Assembler::notEqual, noException);
3120   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3121   // QQQ this is useless it was null above
3122   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3123   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3124   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3125 
3126   __ verify_oop(rax);
3127 
3128   // Overwrite the result registers with the exception results.
3129   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3130   // I think this is useless
3131   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
3132 
3133   __ bind(noException);
3134 
3135   // Only register save data is on the stack.
3136   // Now restore the result registers.  Everything else is either dead
3137   // or captured in the vframeArray.
3138   RegisterSaver::restore_result_registers(masm);
3139 
3140   // All of the register save area has been popped off the stack. Only the
3141   // return address remains.
3142 
3143   // Pop all the frames we must move/replace.
3144   //
3145   // Frame picture (youngest to oldest)
3146   // 1: self-frame (no frame link)
3147   // 2: deopting frame  (no frame link)
3148   // 3: caller of deopting frame (could be compiled/interpreted).
3149   //
3150   // Note: by leaving the return address of self-frame on the stack
3151   // and using the size of frame 2 to adjust the stack
3152   // when we are done the return to frame 3 will still be on the stack.
3153 
3154   // Pop deoptimized frame
3155   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
3156   __ addptr(rsp, rcx);
3157 
3158   // rsp should be pointing at the return address to the caller (3)
3159 
3160   // Pick up the initial fp we should save
3161   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3162   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3163 
3164 #ifdef ASSERT
3165   // Compilers generate code that bangs the stack by as much as the
3166   // interpreter would need. So this stack banging should never
3167   // trigger a fault. Verify that it does not on non-product builds.
3168   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3169   __ bang_stack_size(rbx, rcx);
3170 #endif
3171 
3172   // Load address of array of frame pcs into rcx
3173   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3174 
3175   // Trash the old pc
3176   __ addptr(rsp, wordSize);
3177 
3178   // Load address of array of frame sizes into rsi
3179   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3180 
3181   // Load counter into rdx
3182   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
3183 
3184   // Now adjust the caller's stack to make up for the extra locals,
3185   // but record the original sp first so that we can save it in the skeletal
3186   // interpreter frame; the stack walking of interpreter_sender will then get the
3187   // unextended sp value and not the "real" sp value.
3188 
3189   const Register sender_sp = r8;
3190 
3191   __ mov(sender_sp, rsp);
3192   __ movl(rbx, Address(rdi,
3193                        Deoptimization::UnrollBlock::
3194                        caller_adjustment_offset()));
3195   __ subptr(rsp, rbx);
3196 
3197   // Push interpreter frames in a loop
3198   Label loop;
3199   __ bind(loop);
3200   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3201   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3202   __ pushptr(Address(rcx, 0));          // Save return address
3203   __ enter();                           // Save old & set new ebp
3204   __ subptr(rsp, rbx);                  // Prolog
3205   // This value is corrected by layout_activation_impl
3206   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3207   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3208   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3209   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3210   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3211   __ decrementl(rdx);                   // Decrement counter
3212   __ jcc(Assembler::notZero, loop);
3213   __ pushptr(Address(rcx, 0));          // Save final return address
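
       // At this point (a sketch of the layout): the deoptimized compiled frame has
       // been replaced by number_of_frames skeletal interpreter frames, each of the form
       //   [ return pc ][ saved rbp ][ body of frame_sizes[k] - 2 words ]
       // chained through interpreter_frame_sender_sp, with one final return address
       // (from the pcs array) on top for the self-frame that is pushed next.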
3214 
3215   // Re-push self-frame
3216   __ enter();                           // Save old & set new ebp
3217 
3218   // Allocate a full sized register save area.
3219   // Return address and rbp are in place, so we allocate two less words.
3220   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3221 
3222   // Restore frame locals after moving the frame
3223   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3224   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3225 
3226   // Call C code.  Need thread but NOT official VM entry
3227   // crud.  We cannot block on this call, no GC can happen.  Call should
3228   // restore return values to their stack-slots with the new SP.
3229   //
3230   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3231 
3232   // Use rbp because the frames look interpreted now
3233   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3234   // Don't need the precise return PC here, just precise enough to point into this code blob.
3235   address the_pc = __ pc();
3236   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3237 
3238   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3239   __ mov(c_rarg0, r15_thread);
3240   __ movl(c_rarg1, r14); // second arg: exec_mode
3241   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3242   // Revert SP alignment after call since we're going to do some SP relative addressing below
3243   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3244 
3245   // Set an oopmap for the call site
3246   // Use the same PC we used for the last java frame
3247   oop_maps->add_gc_map(the_pc - start,
3248                        new OopMap( frame_size_in_words, 0 ));
3249 
3250   // Clear fp AND pc
3251   __ reset_last_Java_frame(true);
3252 
3253   // Collect return values
3254   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3255   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3256   // I think this is useless (throwing pc?)
3257   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3258 
3259   // Pop self-frame.
3260   __ leave();                           // Epilog
3261 
3262   // Jump to interpreter
3263   __ ret(0);
3264 
3265   // Make sure all code is generated
3266   masm->flush();
3267 
3268   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3269   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3270 #if INCLUDE_JVMCI
3271   if (EnableJVMCI) {
3272     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3273     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3274   }
3275 #endif
3276 }
3277 
3278 //------------------------------generate_handler_blob------
3279 //
3280 // Generate a special Compile2Runtime blob that saves all registers,
3281 // and setup oopmap.
3282 //
3283 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) {
3284   assert(StubRoutines::forward_exception_entry() != nullptr,
3285          "must be generated before");
3286   assert(is_polling_page_id(id), "expected a polling page stub id");
3287 
3288   ResourceMark rm;
3289   OopMapSet *oop_maps = new OopMapSet();
3290   OopMap* map;
3291 
3292   // Allocate space for the code.  Setup code generation tools.
3293   const char* name = SharedRuntime::stub_name(id);
3294   CodeBuffer buffer(name, 2548, 1024);
3295   MacroAssembler* masm = new MacroAssembler(&buffer);
3296 
3297   address start   = __ pc();
3298   address call_pc = nullptr;
3299   int frame_size_in_words;
3300   bool cause_return = (id == SharedStubId::polling_page_return_handler_id);
3301   bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id);
3302 
3303   // Make room for return address (or push it again)
3304   if (!cause_return) {
3305     __ push(rbx);
3306   }
3307 
3308   // Save registers, fpu state, and flags
3309   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3310 
3311   // The following is basically a call_VM.  However, we need the precise
3312   // address of the call in order to generate an oopmap. Hence, we do all the
3313   // work ourselves.
3314 
3315   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3316 
3317   // The return address must always be correct so that frame constructor never
3318   // sees an invalid pc.
3319 
3320   if (!cause_return) {
3321     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3322     // Additionally, rbx is a callee saved register and we can look at it later to determine
3323     // if someone changed the return address for us!
3324     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3325     __ movptr(Address(rbp, wordSize), rbx);
3326   }
3327 
3328   // Do the call
3329   __ mov(c_rarg0, r15_thread);
3330   __ call(RuntimeAddress(call_ptr));
3331 
3332   // Set an oopmap for the call site.  This oopmap will map all
3333   // oop-registers and debug-info registers as callee-saved.  This
3334   // will allow deoptimization at this safepoint to find all possible
3335   // debug-info recordings, as well as let GC find all oops.
3336 
3337   oop_maps->add_gc_map( __ pc() - start, map);
3338 
3339   Label noException;
3340 
3341   __ reset_last_Java_frame(false);
3342 
3343   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3344   __ jcc(Assembler::equal, noException);
3345 
3346   // Exception pending
3347 
3348   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3349 
3350   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3351 
3352   // No exception case
3353   __ bind(noException);
3354 
3355   Label no_adjust;
3356 #ifdef ASSERT
3357   Label bail;
3358 #endif
3359   if (!cause_return) {
3360     Label no_prefix, not_special, check_rex_prefix;
3361 
3362     // If our stashed return pc was modified by the runtime we avoid touching it
3363     __ cmpptr(rbx, Address(rbp, wordSize));
3364     __ jcc(Assembler::notEqual, no_adjust);
3365 
3366     // Skip over the poll instruction.
3367     // See NativeInstruction::is_safepoint_poll()
3368     // Possible encodings:
3369     //      85 00       test   %eax,(%rax)
3370     //      85 01       test   %eax,(%rcx)
3371     //      85 02       test   %eax,(%rdx)
3372     //      85 03       test   %eax,(%rbx)
3373     //      85 06       test   %eax,(%rsi)
3374     //      85 07       test   %eax,(%rdi)
3375     //
3376     //   41 85 00       test   %eax,(%r8)
3377     //   41 85 01       test   %eax,(%r9)
3378     //   41 85 02       test   %eax,(%r10)
3379     //   41 85 03       test   %eax,(%r11)
3380     //   41 85 06       test   %eax,(%r14)
3381     //   41 85 07       test   %eax,(%r15)
3382     //
3383     //      85 04 24    test   %eax,(%rsp)
3384     //   41 85 04 24    test   %eax,(%r12)
3385     //      85 45 00    test   %eax,0x0(%rbp)
3386     //   41 85 45 00    test   %eax,0x0(%r13)
3387     //
3388     // Notes:
3389     //  Format of a legacy MAP0 test instruction:
3390     //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
3391     //  o  For a safepoint polling instruction such as "test %eax,(%rax)", the encodings of the
3392     //     first register operand and of the base register of the memory operand are in [0-8),
3393     //     so no additional REX prefix (whose REX.B bit holds the MSB of the register encoding)
3394     //     is required, which is why a two-byte encoding is sufficient here.
3395     //  o  For a safepoint polling instruction such as "test %eax,(%r8)", the encoding of the BASE
3396     //     register of the memory operand is 1000, so an additional REX prefix is needed,
3397     //     thereby adding one byte to the instruction encoding.
3398     //  o  If the BASE register is one of the 32 extended GPRs available only on targets
3399     //     supporting the Intel APX extension, a two-byte REX2 prefix must be emitted to hold
3400     //     the most significant two bits of the 5-bit register encoding.
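         // In pseudocode, the adjustment below amounts to (a sketch):
         //   if (APX && byte[0] == REX2)               pc += 2;  // skip two-byte REX2 prefix
         //   if (byte[0] == REX.B prefix)              pc += 1;  // skip one-byte REX prefix
         //   if ((modrm & 7) == 4 || (modrm & 7) == 5) pc += 1;  // SIB byte (rsp/r12) or disp8 (rbp/r13)
         //   pc += 2;                                            // 0x85 opcode + ModRM byte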
3401 
3402     if (VM_Version::supports_apx_f()) {
3403       __ cmpb(Address(rbx, 0), Assembler::REX2);
3404       __ jccb(Assembler::notEqual, check_rex_prefix);
3405       __ addptr(rbx, 2);
3406       __ bind(check_rex_prefix);
3407     }
3408     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3409     __ jccb(Assembler::notEqual, no_prefix);
3410     __ addptr(rbx, 1);
3411     __ bind(no_prefix);
3412 #ifdef ASSERT
3413     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3414 #endif
3415     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3416     // r12/rsp 0x04
3417     // r13/rbp 0x05
3418     __ movzbq(rcx, Address(rbx, 1));
3419     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3420     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3421     __ cmpptr(rcx, 1);
3422     __ jccb(Assembler::above, not_special);
3423     __ addptr(rbx, 1);
3424     __ bind(not_special);
3425 #ifdef ASSERT
3426     // Verify the correct encoding of the poll we're about to skip.
3427     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3428     __ jcc(Assembler::notEqual, bail);
3429     // Mask out the modrm bits
3430     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3431     // rax encodes to 0, so if the bits are nonzero it's incorrect
3432     __ jcc(Assembler::notZero, bail);
3433 #endif
3434     // Adjust return pc forward to step over the safepoint poll instruction
3435     __ addptr(rbx, 2);
3436     __ movptr(Address(rbp, wordSize), rbx);
3437   }
3438 
3439   __ bind(no_adjust);
3440   // Normal exit, restore registers and exit.
3441   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3442   __ ret(0);
3443 
3444 #ifdef ASSERT
3445   __ bind(bail);
3446   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3447 #endif
3448 
3449   // Make sure all code is generated
3450   masm->flush();
3451 
3452   // Fill-out other meta info
3453   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3454 }
3455 
3456 //
3457 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3458 //
3459 // Generate a stub that calls into vm to find out the proper destination
3460 // of a java call. All the argument registers are live at this point
3461 // but since this is generic code we don't know what they are and the caller
3462 // must do any gc of the args.
3463 //
3464 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3465   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3466   assert(is_resolve_id(id), "expected a resolve stub id");
3467 
3468   // allocate space for the code
3469   ResourceMark rm;
3470 
3471   const char* name = SharedRuntime::stub_name(id);
3472   CodeBuffer buffer(name, 1552, 512);
3473   MacroAssembler* masm = new MacroAssembler(&buffer);
3474 
3475   int frame_size_in_words;
3476 
3477   OopMapSet *oop_maps = new OopMapSet();
3478   OopMap* map = nullptr;
3479 
3480   int start = __ offset();
3481 
3482   // No need to save vector registers since they are caller-saved anyway.
3483   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3484 
3485   int frame_complete = __ offset();
3486 
3487   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3488 
3489   __ mov(c_rarg0, r15_thread);
3490 
3491   __ call(RuntimeAddress(destination));
3492 
3493 
3494   // Set an oopmap for the call site.
3495   // We need this not only for callee-saved registers, but also for volatile
3496   // registers that the compiler might be keeping live across a safepoint.
3497 
3498   oop_maps->add_gc_map( __ offset() - start, map);
3499 
3500   // rax contains the address we are going to jump to assuming no exception got installed
3501 
3502   // clear last_Java_sp
3503   __ reset_last_Java_frame(false);
3504   // check for pending exceptions
3505   Label pending;
3506   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3507   __ jcc(Assembler::notEqual, pending);
3508 
3509   // get the returned Method*
3510   __ get_vm_result_metadata(rbx);
3511   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3512 
3513   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3514 
3515   RegisterSaver::restore_live_registers(masm);
3516 
3517   // We are back to the original state on entry and ready to go.
3518 
3519   __ jmp(rax);
3520 
3521   // Pending exception after the safepoint
3522 
3523   __ bind(pending);
3524 
3525   RegisterSaver::restore_live_registers(masm);
3526 
3527   // exception pending => remove activation and forward to exception handler
3528 
3529   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3530 
3531   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3532   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3533 
3534   // -------------
3535   // make sure all code is generated
3536   masm->flush();
3537 
3538   // return the  blob
3539   // frame_size_words or bytes??
3540   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3541 }
3542 
3543 // Continuation point for throwing of implicit exceptions that are
3544 // not handled in the current activation. Fabricates an exception
3545 // oop and initiates normal exception dispatching in this
3546 // frame. Since we need to preserve callee-saved values (currently
3547 // only for C2, but done for C1 as well) we need a callee-saved oop
3548 // map and therefore have to make these stubs into RuntimeStubs
3549 // rather than BufferBlobs.  If the compiler needs all registers to
3550 // be preserved between the fault point and the exception handler
3551 // then it must assume responsibility for that in
3552 // AbstractCompiler::continuation_for_implicit_null_exception or
3553 // continuation_for_implicit_division_by_zero_exception. All other
3554 // implicit exceptions (e.g., NullPointerException or
3555 // AbstractMethodError on entry) are either at call sites or
3556 // otherwise assume that stack unwinding will be initiated, so
3557 // caller saved registers were assumed volatile in the compiler.
3558 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
3559   assert(is_throw_id(id), "expected a throw stub id");
3560 
3561   const char* name = SharedRuntime::stub_name(id);
3562 
3563   // Information about frame layout at time of blocking runtime call.
3564   // Note that we only have to preserve callee-saved registers since
3565   // the compilers are responsible for supplying a continuation point
3566   // if they expect all registers to be preserved.
3567   enum layout {
3568     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3569     rbp_off2,
3570     return_off,
3571     return_off2,
3572     framesize // inclusive of return address
3573   };
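
       // For example (a sketch; arg_reg_save_area_bytes is platform-dependent, e.g.
       // 0 on Linux and 32 on Windows): with no shadow area the frame is 4 slots
       // (2 for rbp, 2 for the return address) == 2 words, and with a 32-byte shadow
       // area it is 12 slots == 6 words; either way framesize/2 is even, which keeps
       // sp 16-byte aligned as the assert below checks.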
3574 
3575   int insts_size = 512;
3576   int locs_size  = 64;
3577 
3578   ResourceMark rm;
3579   const char* timer_msg = "SharedRuntime generate_throw_exception";
3580   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3581 
3582   CodeBuffer code(name, insts_size, locs_size);
3583   OopMapSet* oop_maps  = new OopMapSet();
3584   MacroAssembler* masm = new MacroAssembler(&code);
3585 
3586   address start = __ pc();
3587 
3588   // This is an inlined and slightly modified version of call_VM
3589   // which has the ability to fetch the return PC out of
3590   // thread-local storage and also sets up last_Java_sp slightly
3591   // differently than the real call_VM
3592 
3593   __ enter(); // required for proper stackwalking of RuntimeStub frame
3594 
3595   assert(is_even(framesize/2), "sp not 16-byte aligned");
3596 
3597   // return address and rbp are already in place
3598   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3599 
3600   int frame_complete = __ pc() - start;
3601 
3602   // Set up last_Java_sp and last_Java_fp
3603   address the_pc = __ pc();
3604   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3605   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3606 
3607   // Call runtime
3608   __ movptr(c_rarg0, r15_thread);
3609   BLOCK_COMMENT("call runtime_entry");
3610   __ call(RuntimeAddress(runtime_entry));
3611 
3612   // Generate oop map
3613   OopMap* map = new OopMap(framesize, 0);
3614 
3615   oop_maps->add_gc_map(the_pc - start, map);
3616 
3617   __ reset_last_Java_frame(true);
3618 
3619   __ leave(); // required for proper stackwalking of RuntimeStub frame
3620 
3621   // check for pending exceptions
3622 #ifdef ASSERT
3623   Label L;
3624   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3625   __ jcc(Assembler::notEqual, L);
3626   __ should_not_reach_here();
3627   __ bind(L);
3628 #endif // ASSERT
3629   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3630 
3631 
3632   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3633   RuntimeStub* stub =
3634     RuntimeStub::new_runtime_stub(name,
3635                                   &code,
3636                                   frame_complete,
3637                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3638                                   oop_maps, false);
3639   return stub;
3640 }
3641 
3642 //------------------------------Montgomery multiplication------------------------
3643 //
3644 
3645 #ifndef _WINDOWS
3646 
3647 // Subtract 0:b from carry:a.  Return carry.
3648 static julong
3649 sub(julong a[], julong b[], julong carry, long len) {
3650   long long i = 0, cnt = len;
3651   julong tmp;
3652   asm volatile("clc; "
3653                "0: ; "
3654                "mov (%[b], %[i], 8), %[tmp]; "
3655                "sbb %[tmp], (%[a], %[i], 8); "
3656                "inc %[i]; dec %[cnt]; "
3657                "jne 0b; "
3658                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3659                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3660                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3661                : "memory");
3662   return tmp;
3663 }
3664 
3665 // Multiply (unsigned) Long A by Long B, accumulating the double-
3666 // length result into the accumulator formed of T0, T1, and T2.
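     // In effect (a sketch): with T2:T1:T0 viewed as a single 192-bit accumulator,
     //   T2:T1:T0 += (unsigned __int128)A * B;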
3667 #define MACC(A, B, T0, T1, T2)                                  \
3668 do {                                                            \
3669   unsigned long hi, lo;                                         \
3670   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3671            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3672            : "r"(A), "a"(B) : "cc");                            \
3673  } while(0)
3674 
3675 // As above, but add twice the double-length result into the
3676 // accumulator.
3677 #define MACC2(A, B, T0, T1, T2)                                 \
3678 do {                                                            \
3679   unsigned long hi, lo;                                         \
3680   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3681            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3682            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3683            : "r"(A), "a"(B) : "cc");                            \
3684  } while(0)
3685 
3686 #else //_WINDOWS
3687 
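     // Subtract 0:b from carry:a and return the carry, as above, but built on the
     // carry-chain intrinsics: a - b is computed as a + ~b with an initial carry of 1
     // (two's complement), and the final _addcarry_u64 folds the borrow into the
     // returned carry word.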
3688 static julong
3689 sub(julong a[], julong b[], julong carry, long len) {
3690   long i;
3691   julong tmp;
3692   unsigned char c = 1;
3693   for (i = 0; i < len; i++) {
3694     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3695     a[i] = tmp;
3696   }
3697   c = _addcarry_u64(c, carry, ~0, &tmp);
3698   return tmp;
3699 }
3700 
3701 // Multiply (unsigned) Long A by Long B, accumulating the double-
3702 // length result into the accumulator formed of T0, T1, and T2.
3703 #define MACC(A, B, T0, T1, T2)                          \
3704 do {                                                    \
3705   julong hi, lo;                            \
3706   lo = _umul128(A, B, &hi);                             \
3707   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3708   c = _addcarry_u64(c, hi, T1, &T1);                    \
3709   _addcarry_u64(c, T2, 0, &T2);                         \
3710  } while(0)
3711 
3712 // As above, but add twice the double-length result into the
3713 // accumulator.
3714 #define MACC2(A, B, T0, T1, T2)                         \
3715 do {                                                    \
3716   julong hi, lo;                            \
3717   lo = _umul128(A, B, &hi);                             \
3718   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3719   c = _addcarry_u64(c, hi, T1, &T1);                    \
3720   _addcarry_u64(c, T2, 0, &T2);                         \
3721   c = _addcarry_u64(0, lo, T0, &T0);                    \
3722   c = _addcarry_u64(c, hi, T1, &T1);                    \
3723   _addcarry_u64(c, T2, 0, &T2);                         \
3724  } while(0)
3725 
3726 #endif //_WINDOWS
3727 
3728 // Fast Montgomery multiplication.  The derivation of the algorithm is
3729 // in  A Cryptographic Library for the Motorola DSP56000,
3730 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
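     // In terms of the math (a sketch): with R = 2^(64*len) and
     // inv == -n[0]^-1 mod 2^64 (checked by the assert below), the routine computes
     // m congruent to a * b * R^-1 (mod n), accumulating every column sum in the
     // triple-precision accumulator t2:t1:t0.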
3731 
3732 static void NOINLINE
3733 montgomery_multiply(julong a[], julong b[], julong n[],
3734                     julong m[], julong inv, int len) {
3735   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3736   int i;
3737 
3738   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3739 
3740   for (i = 0; i < len; i++) {
3741     int j;
3742     for (j = 0; j < i; j++) {
3743       MACC(a[j], b[i-j], t0, t1, t2);
3744       MACC(m[j], n[i-j], t0, t1, t2);
3745     }
3746     MACC(a[i], b[0], t0, t1, t2);
3747     m[i] = t0 * inv;
3748     MACC(m[i], n[0], t0, t1, t2);
3749 
3750     assert(t0 == 0, "broken Montgomery multiply");
3751 
3752     t0 = t1; t1 = t2; t2 = 0;
3753   }
3754 
3755   for (i = len; i < 2*len; i++) {
3756     int j;
3757     for (j = i-len+1; j < len; j++) {
3758       MACC(a[j], b[i-j], t0, t1, t2);
3759       MACC(m[j], n[i-j], t0, t1, t2);
3760     }
3761     m[i-len] = t0;
3762     t0 = t1; t1 = t2; t2 = 0;
3763   }
3764 
3765   while (t0)
3766     t0 = sub(m, n, t0, len);
3767 }
3768 
3769 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3770 // multiplies so it should be up to 25% faster than Montgomery
3771 // multiplication.  However, its loop control is more complex and it
3772 // may actually run slower on some machines.
3773 
3774 static void NOINLINE
3775 montgomery_square(julong a[], julong n[],
3776                   julong m[], julong inv, int len) {
3777   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3778   int i;
3779 
3780   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3781 
3782   for (i = 0; i < len; i++) {
3783     int j;
3784     int end = (i+1)/2;
3785     for (j = 0; j < end; j++) {
3786       MACC2(a[j], a[i-j], t0, t1, t2);
3787       MACC(m[j], n[i-j], t0, t1, t2);
3788     }
3789     if ((i & 1) == 0) {
3790       MACC(a[j], a[j], t0, t1, t2);
3791     }
3792     for (; j < i; j++) {
3793       MACC(m[j], n[i-j], t0, t1, t2);
3794     }
3795     m[i] = t0 * inv;
3796     MACC(m[i], n[0], t0, t1, t2);
3797 
3798     assert(t0 == 0, "broken Montgomery square");
3799 
3800     t0 = t1; t1 = t2; t2 = 0;
3801   }
3802 
3803   for (i = len; i < 2*len; i++) {
3804     int start = i-len+1;
3805     int end = start + (len - start)/2;
3806     int j;
3807     for (j = start; j < end; j++) {
3808       MACC2(a[j], a[i-j], t0, t1, t2);
3809       MACC(m[j], n[i-j], t0, t1, t2);
3810     }
3811     if ((i & 1) == 0) {
3812       MACC(a[j], a[j], t0, t1, t2);
3813     }
3814     for (; j < len; j++) {
3815       MACC(m[j], n[i-j], t0, t1, t2);
3816     }
3817     m[i-len] = t0;
3818     t0 = t1; t1 = t2; t2 = 0;
3819   }
3820 
3821   while (t0)
3822     t0 = sub(m, n, t0, len);
3823 }
3824 
3825 // Swap words in a longword.
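     // e.g. swap(0x0000000100000002) == 0x0000000200000001 (a 32-bit rotation).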
3826 static julong swap(julong x) {
3827   return (x << 32) | (x >> 32);
3828 }
3829 
3830 // Copy len longwords from s to d, word-swapping as we go.  The
3831 // destination array is reversed.
3832 static void reverse_words(julong *s, julong *d, int len) {
3833   d += len;
3834   while(len-- > 0) {
3835     d--;
3836     *d = swap(*s);
3837     s++;
3838   }
3839 }
3840 
3841 // The threshold at which squaring is advantageous was determined
3842 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3843 #define MONTGOMERY_SQUARING_THRESHOLD 64
3844 
3845 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3846                                         jint len, jlong inv,
3847                                         jint *m_ints) {
3848   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3849   int longwords = len/2;
3850 
3851   // Make very sure we don't use so much space that the stack might
3852   // overflow.  512 jints correspond to a 16384-bit integer and
3853   // will use here a total of 8k bytes of stack space.
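       // Worked example (a sketch): for len == 512 jints, longwords == 256 and
       // total_allocation == 256 * sizeof(julong) * 4 == 8192 bytes.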
3854   int divisor = sizeof(julong) * 4;
3855   guarantee(longwords <= 8192 / divisor, "must be");
3856   int total_allocation = longwords * sizeof (julong) * 4;
3857   julong *scratch = (julong *)alloca(total_allocation);
3858 
3859   // Local scratch arrays
3860   julong
3861     *a = scratch + 0 * longwords,
3862     *b = scratch + 1 * longwords,
3863     *n = scratch + 2 * longwords,
3864     *m = scratch + 3 * longwords;
3865 
3866   reverse_words((julong *)a_ints, a, longwords);
3867   reverse_words((julong *)b_ints, b, longwords);
3868   reverse_words((julong *)n_ints, n, longwords);
3869 
3870   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3871 
3872   reverse_words(m, (julong *)m_ints, longwords);
3873 }
3874 
3875 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3876                                       jint len, jlong inv,
3877                                       jint *m_ints) {
3878   assert(len % 2 == 0, "array length in montgomery_square must be even");
3879   int longwords = len/2;
3880 
3881   // Make very sure we don't use so much space that the stack might
3882   // overflow.  512 jints correspond to a 16384-bit integer and
3883   // will use here a total of 6k bytes of stack space.
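       // Worked example (a sketch): for len == 512 jints, longwords == 256 and
       // total_allocation == 256 * sizeof(julong) * 3 == 6144 bytes.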
3884   int divisor = sizeof(julong) * 3;
3885   guarantee(longwords <= (8192 / divisor), "must be");
3886   int total_allocation = longwords * sizeof (julong) * 3;
3887   julong *scratch = (julong *)alloca(total_allocation);
3888 
3889   // Local scratch arrays
3890   julong
3891     *a = scratch + 0 * longwords,
3892     *n = scratch + 1 * longwords,
3893     *m = scratch + 2 * longwords;
3894 
3895   reverse_words((julong *)a_ints, a, longwords);
3896   reverse_words((julong *)n_ints, n, longwords);
3897 
3898   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3899     ::montgomery_square(a, n, m, (julong)inv, longwords);
3900   } else {
3901     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3902   }
3903 
3904   reverse_words(m, (julong *)m_ints, longwords);
3905 }
3906 
3907 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
3908   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
3909   CodeBuffer buffer(buf);
3910   short buffer_locs[20];
3911   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
3912                                          sizeof(buffer_locs)/sizeof(relocInfo));
3913 
3914   MacroAssembler* masm = new MacroAssembler(&buffer);
3915 
3916   const Array<SigEntry>* sig_vk = vk->extended_sig();
3917   const Array<VMRegPair>* regs = vk->return_regs();
3918 
3919   int pack_fields_jobject_off = __ offset();
3920   // Resolve pre-allocated buffer from JNI handle.
3921   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
3922   __ movptr(rax, Address(r13, 0));
3923   __ resolve_jobject(rax /* value */,
3924                      r12 /* tmp */);
3925   __ movptr(Address(r13, 0), rax);
3926 
3927   int pack_fields_off = __ offset();
3928 
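       // Walk the extended signature and store each field value from its return
       // register into the buffer at rax + offset. T_METADATA entries carry no value
       // and are skipped; a T_VOID entry advances the register index only when it is
       // the second half of a preceding long/double. The unpack loop further below
       // mirrors this walk in the opposite direction.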
3929   int j = 1;
3930   for (int i = 0; i < sig_vk->length(); i++) {
3931     BasicType bt = sig_vk->at(i)._bt;
3932     if (bt == T_METADATA) {
3933       continue;
3934     }
3935     if (bt == T_VOID) {
3936       if (sig_vk->at(i-1)._bt == T_LONG ||
3937           sig_vk->at(i-1)._bt == T_DOUBLE) {
3938         j++;
3939       }
3940       continue;
3941     }
3942     int off = sig_vk->at(i)._offset;
3943     assert(off > 0, "offset in object should be positive");
3944     VMRegPair pair = regs->at(j);
3945     VMReg r_1 = pair.first();
3946     VMReg r_2 = pair.second();
3947     Address to(rax, off);
3948     if (bt == T_FLOAT) {
3949       __ movflt(to, r_1->as_XMMRegister());
3950     } else if (bt == T_DOUBLE) {
3951       __ movdbl(to, r_1->as_XMMRegister());
3952     } else {
3953       Register val = r_1->as_Register();
3954       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
3955       if (is_reference_type(bt)) {
3956         __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
3957       } else {
3958         __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
3959       }
3960     }
3961     j++;
3962   }
3963   assert(j == regs->length(), "missed a field?");
3964   if (vk->has_nullable_atomic_layout()) {
3965     // Set the null marker
3966     __ movb(Address(rax, vk->null_marker_offset()), 1);
3967   }
3968   __ ret(0);
3969 
3970   int unpack_fields_off = __ offset();
3971 
3972   Label skip;
3973   __ testptr(rax, rax);
3974   __ jcc(Assembler::zero, skip);
3975 
  j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_METADATA) {
      continue;
    }
    if (bt == T_VOID) {
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }
    int off = sig_vk->at(i)._offset;
    assert(off > 0, "offset in object should be positive");
    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    VMReg r_2 = pair.second();
    Address from(rax, off);
    if (bt == T_FLOAT) {
      __ movflt(r_1->as_XMMRegister(), from);
    } else if (bt == T_DOUBLE) {
      __ movdbl(r_1->as_XMMRegister(), from);
    } else if (bt == T_OBJECT || bt == T_ARRAY) {
      assert_different_registers(rax, r_1->as_Register());
      __ load_heap_oop(r_1->as_Register(), from);
    } else {
      assert(is_java_primitive(bt), "unexpected basic type");
      assert_different_registers(rax, r_1->as_Register());
      size_t size_in_bytes = type2aelembytes(bt);
      __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
    }
    j++;
  }
  assert(j == regs->length(), "missed a field?");

  __ bind(skip);
  __ ret(0);

  __ flush();

  return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
}

#if INCLUDE_JFR

// For c2: c_rarg0 is junk; call into the runtime to write a checkpoint.
// It returns a jobject handle to the event writer.
// The handle is dereferenced and the return value is the event writer oop.
RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
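  // The stub frame holds only the saved rbp and the return address. Offsets
  // are in 32-bit VMReg slots, so each 64-bit value occupies two slots.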
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

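  // Set up a last Java frame so the stack remains walkable across the
  // runtime call, and tear it down again afterwards.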
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
  __ reset_last_Java_frame(true);

  // rax holds the jobject handle result; unpack it and process it through a barrier.
  __ resolve_global_jobject(rax, c_rarg0);

  __ leave();
  __ ret(0);

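  // Register an (empty) oop map at the frame-complete point; the frame
  // contains no oops that need describing.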
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}

// For c2: call to return a leased buffer.
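// Same frame shape and call sequence as generate_jfr_write_checkpoint() above,
// but no result handle needs to be resolved afterwards.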
RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
  __ reset_last_Java_frame(true);

  __ leave();
  __ ret(0);

  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}

#endif // INCLUDE_JFR