1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "classfile/symbolTable.hpp"
  32 #include "code/compiledIC.hpp"
  33 #include "code/debugInfoRec.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "oops/method.inline.hpp"
  47 #include "prims/methodHandles.hpp"
  48 #include "runtime/continuation.hpp"
  49 #include "runtime/continuationEntry.inline.hpp"
  50 #include "runtime/globals.hpp"
  51 #include "runtime/jniHandles.hpp"
  52 #include "runtime/safepointMechanism.hpp"
  53 #include "runtime/sharedRuntime.hpp"
  54 #include "runtime/signature.hpp"
  55 #include "runtime/stubRoutines.hpp"
  56 #include "runtime/timerTrace.hpp"
  57 #include "runtime/vframeArray.hpp"
  58 #include "runtime/vm_version.hpp"
  59 #include "utilities/align.hpp"
  60 #include "utilities/checkedCast.hpp"
  61 #include "utilities/formatBuffer.hpp"
  62 #include "vmreg_x86.inline.hpp"
  63 #ifdef COMPILER1
  64 #include "c1/c1_Runtime1.hpp"
  65 #endif
  66 #ifdef COMPILER2
  67 #include "opto/runtime.hpp"
  68 #endif
  69 #if INCLUDE_JVMCI
  70 #include "jvmci/jvmciJavaClasses.hpp"
  71 #endif
  72 
  73 #define __ masm->
  74 
  75 #ifdef PRODUCT
  76 #define BLOCK_COMMENT(str) /* nothing */
  77 #else
  78 #define BLOCK_COMMENT(str) __ block_comment(str)
  79 #endif // PRODUCT
  80 
  81 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  82 
  83 class RegisterSaver {
  84   // Capture info about frame layout.  Layout offsets are in jint
  85   // units because compiler frame slots are jints.
  86 #define XSAVE_AREA_BEGIN 160
  87 #define XSAVE_AREA_YMM_BEGIN 576
  88 #define XSAVE_AREA_EGPRS 960
  89 #define XSAVE_AREA_OPMASK_BEGIN 1088
  90 #define XSAVE_AREA_ZMM_BEGIN 1152
  91 #define XSAVE_AREA_UPPERBANK 1664
  92 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  93 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  94 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  95 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  96 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
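  // For example, DEF_XMM_OFFS(1) expands to the two consecutive enumerators
  //   xmm1_off  = xmm_off + 1*16/BytesPerInt = xmm_off + 4,
  //   xmm1H_off = xmm_off + 5
  // so consecutive XMM save slots are 16 bytes (four jint stack slots) apart in the
  // fxsave/xsave save area.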
  97   enum layout {
  98     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  99     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
 100     DEF_XMM_OFFS(0),
 101     DEF_XMM_OFFS(1),
 102     // 2..15 are implied in range usage
 103     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 104     DEF_YMM_OFFS(0),
 105     DEF_YMM_OFFS(1),
 106     // 2..15 are implied in range usage
 107     r31_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 108     r31H_off,
 109     r30_off, r30H_off,
 110     r29_off, r29H_off,
 111     r28_off, r28H_off,
 112     r27_off, r27H_off,
 113     r26_off, r26H_off,
 114     r25_off, r25H_off,
 115     r24_off, r24H_off,
 116     r23_off, r23H_off,
 117     r22_off, r22H_off,
 118     r21_off, r21H_off,
 119     r20_off, r20H_off,
 120     r19_off, r19H_off,
 121     r18_off, r18H_off,
 122     r17_off, r17H_off,
 123     r16_off, r16H_off,
 124     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 125     DEF_OPMASK_OFFS(0),
 126     DEF_OPMASK_OFFS(1),
 127     // 2..7 are implied in range usage
 128     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 129     DEF_ZMM_OFFS(0),
 130     DEF_ZMM_OFFS(1),
 131     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 132     DEF_ZMM_UPPER_OFFS(16),
 133     DEF_ZMM_UPPER_OFFS(17),
 134     // 18..31 are implied in range usage
 135     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 136     fpu_stateH_end,
 137     r15_off, r15H_off,
 138     r14_off, r14H_off,
 139     r13_off, r13H_off,
 140     r12_off, r12H_off,
 141     r11_off, r11H_off,
 142     r10_off, r10H_off,
 143     r9_off,  r9H_off,
 144     r8_off,  r8H_off,
 145     rdi_off, rdiH_off,
 146     rsi_off, rsiH_off,
 147     ignore_off, ignoreH_off,  // extra copy of rbp
 148     rsp_off, rspH_off,
 149     rbx_off, rbxH_off,
 150     rdx_off, rdxH_off,
 151     rcx_off, rcxH_off,
 152     rax_off, raxH_off,
 153     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 154     align_off, alignH_off,
 155     flags_off, flagsH_off,
 156     // The frame sender code expects that rbp will be in the "natural" place and
 157     // will override any oopMap setting for it. We must therefore force the layout
 158     // so that it agrees with the frame sender code.
 159     rbp_off, rbpH_off,        // copy of rbp we will restore
 160     return_off, returnH_off,  // slot for return address
 161     reg_save_size             // size in compiler stack slots
 162   };
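  // Rough picture of the save area described above, from low offsets to high: the
  // fxsave/xsave state occupies the lowest slots, followed by the saved general
  // purpose registers (r15 ... rax), an alignment word, the saved flags, the saved
  // rbp, and finally the return address slot at the top of the frame.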
 163 
 164  public:
 165   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 166   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 167 
 168   // Offsets into the register save area
 169   // Used by deoptimization when it is managing result register
 170   // values on its own
 171 
 172   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 173   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 174   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 175   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 176   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 177 
  // During deoptimization only the result registers need to be restored;
  // all the other values have already been extracted.
 180   static void restore_result_registers(MacroAssembler* masm);
 181 };
 182 
 183 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 184   int off = 0;
 185   int num_xmm_regs = XMMRegister::available_xmm_registers();
 186 #if COMPILER2_OR_JVMCI
 187   if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
 189   }
 190   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 191 #else
 192   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 193 #endif
 194 
  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 196   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 197   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 198   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 199   // CodeBlob frame size is in words.
 200   int frame_size_in_words = frame_size_in_bytes / wordSize;
 201   *total_frame_words = frame_size_in_words;
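  // As a sanity check on the unit conversions: a save area of, say, 0xB10 bytes
  // (purely illustrative) corresponds to 0xB10/4 = 0x2C4 compiler slots and
  // 0xB10/8 = 0x162 words.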
 202 
 203   // Save registers, fpu state, and flags.
 204   // We assume caller has already pushed the return address onto the
 205   // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, just as a normal enter would place it.
 208 
 209   __ enter();          // rsp becomes 16-byte aligned here
 210   __ pushf();
 211   // Make sure rsp stays 16-byte aligned
 212   __ subq(rsp, 8);
  // Push the CPU state in multiples of 16 bytes
 214   __ save_legacy_gprs();
 215   __ push_FPU_state();
 216 
 217 
  // The CPU state push above handles this on EVEX-enabled targets
 219   if (save_wide_vectors) {
 220     // Save upper half of YMM registers(0..15)
 221     int base_addr = XSAVE_AREA_YMM_BEGIN;
 222     for (int n = 0; n < 16; n++) {
 223       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 224     }
 225     if (VM_Version::supports_evex()) {
 226       // Save upper half of ZMM registers(0..15)
 227       base_addr = XSAVE_AREA_ZMM_BEGIN;
 228       for (int n = 0; n < 16; n++) {
 229         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 230       }
 231       // Save full ZMM registers(16..num_xmm_regs)
 232       base_addr = XSAVE_AREA_UPPERBANK;
 233       off = 0;
 234       int vector_len = Assembler::AVX_512bit;
 235       for (int n = 16; n < num_xmm_regs; n++) {
 236         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 237       }
 238 #if COMPILER2_OR_JVMCI
 239       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 240       off = 0;
 241       for(int n = 0; n < KRegister::number_of_registers; n++) {
 242         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 243       }
 244 #endif
 245     }
 246   } else {
 247     if (VM_Version::supports_evex()) {
 248       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 249       int base_addr = XSAVE_AREA_UPPERBANK;
 250       off = 0;
 251       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 252       for (int n = 16; n < num_xmm_regs; n++) {
 253         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 254       }
 255 #if COMPILER2_OR_JVMCI
 256       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 257       off = 0;
 258       for(int n = 0; n < KRegister::number_of_registers; n++) {
 259         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 260       }
 261 #endif
 262     }
 263   }
 264 
 265 #if COMPILER2_OR_JVMCI
 266   if (UseAPX) {
 267       int base_addr = XSAVE_AREA_EGPRS;
 268       off = 0;
 269       for(int n = 16; n < Register::number_of_registers; n++) {
 270         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 271       }
 272   }
 273 #endif
 274 
 275   __ vzeroupper();
 276   if (frame::arg_reg_save_area_bytes != 0) {
 277     // Allocate argument register save area
 278     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 279   }
 280 
 281   // Set an oopmap for the call site.  This oopmap will map all
 282   // oop-registers and debug-info registers as callee-saved.  This
 283   // will allow deoptimization at this safepoint to find all possible
 284   // debug-info recordings, as well as let GC find all oops.
 285 
 286   OopMapSet *oop_maps = new OopMapSet();
 287   OopMap* map = new OopMap(frame_size_in_slots, 0);
 288 
 289 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 290 
 291   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 293   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 294   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 295   // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
 297   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 305   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 306   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 307 
 308   if (UseAPX) {
 309     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 324     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 325   }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX-enabled targets we get it included in the XSAVE area.
 328   off = xmm0_off;
 329   int delta = xmm1_off - off;
 330   for (int n = 0; n < 16; n++) {
 331     XMMRegister xmm_name = as_XMMRegister(n);
 332     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 333     off += delta;
 334   }
 335   if (UseAVX > 2) {
 336     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 337     off = zmm16_off;
 338     delta = zmm17_off - off;
 339     for (int n = 16; n < num_xmm_regs; n++) {
 340       XMMRegister zmm_name = as_XMMRegister(n);
 341       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 342       off += delta;
 343     }
 344   }
 345 
 346 #if COMPILER2_OR_JVMCI
 347   if (save_wide_vectors) {
 348     // Save upper half of YMM registers(0..15)
 349     off = ymm0_off;
 350     delta = ymm1_off - ymm0_off;
 351     for (int n = 0; n < 16; n++) {
 352       XMMRegister ymm_name = as_XMMRegister(n);
 353       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 354       off += delta;
 355     }
 356     if (VM_Version::supports_evex()) {
 357       // Save upper half of ZMM registers(0..15)
 358       off = zmm0_off;
 359       delta = zmm1_off - zmm0_off;
 360       for (int n = 0; n < 16; n++) {
 361         XMMRegister zmm_name = as_XMMRegister(n);
 362         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 363         off += delta;
 364       }
 365     }
 366   }
 367 #endif // COMPILER2_OR_JVMCI
 368 
 369   // %%% These should all be a waste but we'll keep things as they were for now
 370   if (true) {
 371     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 373     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 374     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 375     // rbp location is known implicitly by the frame sender code, needs no oopmap
 376     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 385     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 386     if (UseAPX) {
 387       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 402       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 403     }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX-enabled targets we get it included in the XSAVE area.
 406     off = xmm0H_off;
 407     delta = xmm1H_off - off;
 408     for (int n = 0; n < 16; n++) {
 409       XMMRegister xmm_name = as_XMMRegister(n);
 410       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 411       off += delta;
 412     }
 413     if (UseAVX > 2) {
 414       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 415       off = zmm16H_off;
 416       delta = zmm17H_off - off;
 417       for (int n = 16; n < num_xmm_regs; n++) {
 418         XMMRegister zmm_name = as_XMMRegister(n);
 419         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 420         off += delta;
 421       }
 422     }
 423   }
 424 
 425   return map;
 426 }
 427 
 428 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 429   int num_xmm_regs = XMMRegister::available_xmm_registers();
 430   if (frame::arg_reg_save_area_bytes != 0) {
 431     // Pop arg register save area
 432     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 433   }
 434 
 435 #if COMPILER2_OR_JVMCI
 436   if (restore_wide_vectors) {
 437     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 438     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 439   }
 440 #else
 441   assert(!restore_wide_vectors, "vectors are generated only by C2");
 442 #endif
 443 
 444   __ vzeroupper();
 445 
 446   // On EVEX enabled targets everything is handled in pop fpu state
 447   if (restore_wide_vectors) {
 448     // Restore upper half of YMM registers (0..15)
 449     int base_addr = XSAVE_AREA_YMM_BEGIN;
 450     for (int n = 0; n < 16; n++) {
 451       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 452     }
 453     if (VM_Version::supports_evex()) {
 454       // Restore upper half of ZMM registers (0..15)
 455       base_addr = XSAVE_AREA_ZMM_BEGIN;
 456       for (int n = 0; n < 16; n++) {
 457         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 458       }
 459       // Restore full ZMM registers(16..num_xmm_regs)
 460       base_addr = XSAVE_AREA_UPPERBANK;
 461       int vector_len = Assembler::AVX_512bit;
 462       int off = 0;
 463       for (int n = 16; n < num_xmm_regs; n++) {
 464         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 465       }
 466 #if COMPILER2_OR_JVMCI
 467       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 468       off = 0;
 469       for (int n = 0; n < KRegister::number_of_registers; n++) {
 470         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 471       }
 472 #endif
 473     }
 474   } else {
 475     if (VM_Version::supports_evex()) {
 476       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 477       int base_addr = XSAVE_AREA_UPPERBANK;
 478       int off = 0;
 479       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 480       for (int n = 16; n < num_xmm_regs; n++) {
 481         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 482       }
 483 #if COMPILER2_OR_JVMCI
 484       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 485       off = 0;
 486       for (int n = 0; n < KRegister::number_of_registers; n++) {
 487         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 488       }
 489 #endif
 490     }
 491   }
 492 
 493 #if COMPILER2_OR_JVMCI
 494   if (UseAPX) {
 495     int base_addr = XSAVE_AREA_EGPRS;
 496     int off = 0;
 497     for (int n = 16; n < Register::number_of_registers; n++) {
 498       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 499     }
 500   }
 501 #endif
 502 
 503   // Recover CPU state
 504   __ pop_FPU_state();
 505   __ restore_legacy_gprs();
 506   __ addq(rsp, 8);
 507   __ popf();
 508   // Get the rbp described implicitly by the calling convention (no oopMap)
 509   __ pop(rbp);
 510 }
 511 
 512 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 513 
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee-save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.
 519 
 520   // Restore fp result register
 521   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 522   // Restore integer result register
 523   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 524   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 525 
  // Pop all of the register save area off the stack except the return address
 527   __ addptr(rsp, return_offset_in_bytes());
 528 }
 529 
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
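// For example, is_wide_vector(16) is false (XMM-sized values are already covered by
// fxsave/fxrstor), while is_wide_vector(32) and is_wide_vector(64) are true (YMM/ZMM).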
 532 bool SharedRuntime::is_wide_vector(int size) {
 533   return size > 16;
 534 }
 535 
 536 // ---------------------------------------------------------------------------
 537 // Read the array of BasicTypes from a signature, and compute where the
 538 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 539 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 540 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 541 // as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot, 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
 546 
 547 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 548 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 549 // units regardless of build. Of course for i486 there is no 64 bit build
 550 
 551 // The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static JNI methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
 555 // advantage out of it.
 556 
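// For example, a signature expanded to
//   sig_bt = { T_INT, T_LONG, T_VOID, T_DOUBLE, T_VOID }
// maps the T_INT to j_rarg0, the T_LONG to j_rarg1 (as a set2 pair), and the T_DOUBLE
// to j_farg0; the T_VOID halves are set_bad(). With no stack arguments the returned
// stk_args is 0.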
 557 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 558                                            VMRegPair *regs,
 559                                            int total_args_passed) {
 560 
 561   // Create the mapping between argument positions and
 562   // registers.
 563   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 564     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 565   };
 566   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 567     j_farg0, j_farg1, j_farg2, j_farg3,
 568     j_farg4, j_farg5, j_farg6, j_farg7
 569   };
 570 
 571 
 572   uint int_args = 0;
 573   uint fp_args = 0;
 574   uint stk_args = 0;
 575 
 576   for (int i = 0; i < total_args_passed; i++) {
 577     switch (sig_bt[i]) {
 578     case T_BOOLEAN:
 579     case T_CHAR:
 580     case T_BYTE:
 581     case T_SHORT:
 582     case T_INT:
 583       if (int_args < Argument::n_int_register_parameters_j) {
 584         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 585       } else {
 586         stk_args = align_up(stk_args, 2);
 587         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 588         stk_args += 1;
 589       }
 590       break;
 591     case T_VOID:
 592       // halves of T_LONG or T_DOUBLE
 593       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 594       regs[i].set_bad();
 595       break;
 596     case T_LONG:
 597       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 598       // fall through
 599     case T_OBJECT:
 600     case T_ARRAY:
 601     case T_ADDRESS:
 602       if (int_args < Argument::n_int_register_parameters_j) {
 603         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 604       } else {
 605         stk_args = align_up(stk_args, 2);
 606         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 607         stk_args += 2;
 608       }
 609       break;
 610     case T_FLOAT:
 611       if (fp_args < Argument::n_float_register_parameters_j) {
 612         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 613       } else {
 614         stk_args = align_up(stk_args, 2);
 615         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 616         stk_args += 1;
 617       }
 618       break;
 619     case T_DOUBLE:
 620       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 621       if (fp_args < Argument::n_float_register_parameters_j) {
 622         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 623       } else {
 624         stk_args = align_up(stk_args, 2);
 625         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 626         stk_args += 2;
 627       }
 628       break;
 629     default:
 630       ShouldNotReachHere();
 631       break;
 632     }
 633   }
 634 
 635   return stk_args;
 636 }
 637 
 638 // Same as java_calling_convention() but for multiple return
 639 // values. There's no way to store them on the stack so if we don't
 640 // have enough registers, multiple values can't be returned.
 641 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 642 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
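// For example, a scalarized return described by sig_bt = { T_INT, T_FLOAT } places the
// T_INT in INT_ArgReg[0] (rax) and the T_FLOAT in FP_ArgReg[0] (j_farg0) and returns 2;
// a return value of -1 means the values do not all fit in registers.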
 643 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 644                                           VMRegPair *regs,
 645                                           int total_args_passed) {
 646   // Create the mapping between argument positions and
 647   // registers.
 648   static const Register INT_ArgReg[java_return_convention_max_int] = {
 649     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 650   };
 651   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 652     j_farg0, j_farg1, j_farg2, j_farg3,
 653     j_farg4, j_farg5, j_farg6, j_farg7
 654   };
 655 
 656 
 657   uint int_args = 0;
 658   uint fp_args = 0;
 659 
 660   for (int i = 0; i < total_args_passed; i++) {
 661     switch (sig_bt[i]) {
 662     case T_BOOLEAN:
 663     case T_CHAR:
 664     case T_BYTE:
 665     case T_SHORT:
 666     case T_INT:
 667       if (int_args < Argument::n_int_register_parameters_j+1) {
 668         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 669         int_args++;
 670       } else {
 671         return -1;
 672       }
 673       break;
 674     case T_VOID:
 675       // halves of T_LONG or T_DOUBLE
 676       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 677       regs[i].set_bad();
 678       break;
 679     case T_LONG:
 680       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 681       // fall through
 682     case T_OBJECT:
 683     case T_ARRAY:
 684     case T_ADDRESS:
 685     case T_METADATA:
 686       if (int_args < Argument::n_int_register_parameters_j+1) {
 687         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 688         int_args++;
 689       } else {
 690         return -1;
 691       }
 692       break;
 693     case T_FLOAT:
 694       if (fp_args < Argument::n_float_register_parameters_j) {
 695         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 696         fp_args++;
 697       } else {
 698         return -1;
 699       }
 700       break;
 701     case T_DOUBLE:
 702       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 703       if (fp_args < Argument::n_float_register_parameters_j) {
 704         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 705         fp_args++;
 706       } else {
 707         return -1;
 708       }
 709       break;
 710     default:
 711       ShouldNotReachHere();
 712       break;
 713     }
 714   }
 715 
 716   return int_args + fp_args;
 717 }
 718 
// Patch the caller's callsite with the entry to compiled code if it exists.
 720 static void patch_callers_callsite(MacroAssembler *masm) {
 721   Label L;
 722   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 723   __ jcc(Assembler::equal, L);
 724 
 725   // Save the current stack pointer
 726   __ mov(r13, rsp);
 727   // Schedule the branch target address early.
 728   // Call into the VM to patch the caller, then jump to compiled callee
 729   // rax isn't live so capture return address while we easily can
 730   __ movptr(rax, Address(rsp, 0));
 731 
 732   // align stack so push_CPU_state doesn't fault
 733   __ andptr(rsp, -(StackAlignmentInBytes));
 734   __ push_CPU_state();
 735   __ vzeroupper();
 736   // VM needs caller's callsite
 737   // VM needs target method
 738   // This needs to be a long call since we will relocate this adapter to
 739   // the codeBuffer and it may not reach
 740 
 741   // Allocate argument register save area
 742   if (frame::arg_reg_save_area_bytes != 0) {
 743     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 744   }
 745   __ mov(c_rarg0, rbx);
 746   __ mov(c_rarg1, rax);
 747   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 748 
 749   // De-allocate argument register save area
 750   if (frame::arg_reg_save_area_bytes != 0) {
 751     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 752   }
 753 
 754   __ vzeroupper();
 755   __ pop_CPU_state();
 756   // restore sp
 757   __ mov(rsp, r13);
 758   __ bind(L);
 759 }
 760 
 761 // For each inline type argument, sig includes the list of fields of
 762 // the inline type. This utility function computes the number of
 763 // arguments for the call if inline types are passed by reference (the
 764 // calling convention the interpreter expects).
 765 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 766   int total_args_passed = 0;
 767   if (InlineTypePassFieldsAsArgs) {
 768     for (int i = 0; i < sig_extended->length(); i++) {
 769       BasicType bt = sig_extended->at(i)._bt;
 770       if (bt == T_METADATA) {
 771         // In sig_extended, an inline type argument starts with:
 772         // T_METADATA, followed by the types of the fields of the
        // inline type and T_VOID to mark the end of the inline
        // type. Inline types are flattened so, for instance, in the
 775         // case of an inline type with an int field and an inline type
 776         // field that itself has 2 fields, an int and a long:
 777         // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
 778         // slot for the T_LONG) T_VOID (inner inline type) T_VOID
 779         // (outer inline type)
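        // In that example the whole flattened sequence still counts as a single
        // interpreter argument: total_args_passed is incremented once for the outer
        // T_METADATA and the loop below skips over the field entries and T_VOID markers.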
 780         total_args_passed++;
 781         int vt = 1;
 782         do {
 783           i++;
 784           BasicType bt = sig_extended->at(i)._bt;
 785           BasicType prev_bt = sig_extended->at(i-1)._bt;
 786           if (bt == T_METADATA) {
 787             vt++;
 788           } else if (bt == T_VOID &&
 789                      prev_bt != T_LONG &&
 790                      prev_bt != T_DOUBLE) {
 791             vt--;
 792           }
 793         } while (vt != 0);
 794       } else {
 795         total_args_passed++;
 796       }
 797     }
 798   } else {
 799     total_args_passed = sig_extended->length();
 800   }
 801   return total_args_passed;
 802 }
 803 
 804 
 805 static void gen_c2i_adapter_helper(MacroAssembler* masm,
 806                                    BasicType bt,
 807                                    BasicType prev_bt,
 808                                    size_t size_in_bytes,
 809                                    const VMRegPair& reg_pair,
 810                                    const Address& to,
 811                                    int extraspace,
 812                                    bool is_oop) {
 813   if (bt == T_VOID) {
 814     assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
 815     return;
 816   }
 817 
 818   // Say 4 args:
 819   // i   st_off
 820   // 0   32 T_LONG
 821   // 1   24 T_VOID
 822   // 2   16 T_OBJECT
 823   // 3    8 T_BOOL
 824   // -    0 return address
 825   //
  // However, to make things extra confusing: because we can fit a long/double in
  // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See, I said it was confusing.
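  // For the example above, the caller passes the address of the lower (T_VOID) slot for
  // longs/doubles, so the T_LONG at i == 0 is stored at st_off 24 while the slot at 32
  // is left unused (gen_c2i_adapter poisons it with junk under ASSERT).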
 830 
 831   bool wide = (size_in_bytes == wordSize);
 832   VMReg r_1 = reg_pair.first();
 833   VMReg r_2 = reg_pair.second();
 834   assert(r_2->is_valid() == wide, "invalid size");
 835   if (!r_1->is_valid()) {
 836     assert(!r_2->is_valid(), "must be invalid");
 837     return;
 838   }
 839 
 840   if (!r_1->is_XMMRegister()) {
 841     Register val = rax;
 842     if (r_1->is_stack()) {
 843       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 844       __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
 845     } else {
 846       val = r_1->as_Register();
 847     }
 848     assert_different_registers(to.base(), val, rscratch1);
 849     if (is_oop) {
 850       __ push(r13);
 851       __ push(rbx);
 852       __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
 853       __ pop(rbx);
 854       __ pop(r13);
 855     } else {
 856       __ store_sized_value(to, val, size_in_bytes);
 857     }
 858   } else {
 859     if (wide) {
 860       __ movdbl(to, r_1->as_XMMRegister());
 861     } else {
 862       __ movflt(to, r_1->as_XMMRegister());
 863     }
 864   }
 865 }
 866 
 867 static void gen_c2i_adapter(MacroAssembler *masm,
 868                             const GrowableArray<SigEntry>* sig_extended,
 869                             const VMRegPair *regs,
 870                             bool requires_clinit_barrier,
 871                             address& c2i_no_clinit_check_entry,
 872                             Label& skip_fixup,
 873                             address start,
 874                             OopMapSet* oop_maps,
 875                             int& frame_complete,
 876                             int& frame_size_in_words,
 877                             bool alloc_inline_receiver) {
 878   if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) {
 879     Label L_skip_barrier;
 880     Register method = rbx;
 881 
 882     { // Bypass the barrier for non-static methods
 883       Register flags = rscratch1;
 884       __ movl(flags, Address(method, Method::access_flags_offset()));
 885       __ testl(flags, JVM_ACC_STATIC);
 886       __ jcc(Assembler::zero, L_skip_barrier); // non-static
 887     }
 888 
 889     Register klass = rscratch1;
 890     __ load_method_holder(klass, method);
 891     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
 892 
 893     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
 894 
 895     __ bind(L_skip_barrier);
 896     c2i_no_clinit_check_entry = __ pc();
 897   }
 898 
 899   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 900   bs->c2i_entry_barrier(masm);
 901 
 902   // Before we get into the guts of the C2I adapter, see if we should be here
 903   // at all.  We've come from compiled code and are attempting to jump to the
 904   // interpreter, which means the caller made a static call to get here
 905   // (vcalls always get a compiled target if there is one).  Check for a
 906   // compiled target.  If there is one, we need to patch the caller's call.
 907   patch_callers_callsite(masm);
 908 
 909   __ bind(skip_fixup);
 910 
 911   if (InlineTypePassFieldsAsArgs) {
 912     // Is there an inline type argument?
 913     bool has_inline_argument = false;
 914     for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
 915       has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
 916     }
 917     if (has_inline_argument) {
      // There is at least one inline type argument: we're coming from
 919       // compiled code so we have no buffers to back the inline types.
 920       // Allocate the buffers here with a runtime call.
 921       OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
 922 
 923       frame_complete = __ offset();
 924 
 925       __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
 926 
 927       __ mov(c_rarg0, r15_thread);
 928       __ mov(c_rarg1, rbx);
 929       __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
 930       __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
 931 
 932       oop_maps->add_gc_map((int)(__ pc() - start), map);
 933       __ reset_last_Java_frame(false);
 934 
 935       RegisterSaver::restore_live_registers(masm);
 936 
 937       Label no_exception;
 938       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
 939       __ jcc(Assembler::equal, no_exception);
 940 
 941       __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
 942       __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 943       __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 944 
 945       __ bind(no_exception);
 946 
 947       // We get an array of objects from the runtime call
 948       __ get_vm_result(rscratch2, r15_thread); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
 949       __ get_vm_result_2(rbx, r15_thread); // TODO: required to keep the callee Method live?
 950     }
 951   }
 952 
 953   // Since all args are passed on the stack, total_args_passed *
 954   // Interpreter::stackElementSize is the space we need.
 955   int total_args_passed = compute_total_args_passed_int(sig_extended);
 956   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 957 
 958   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 959 
 960   // stack is aligned, keep it that way
 961   // This is not currently needed or enforced by the interpreter, but
 962   // we might as well conform to the ABI.
 963   extraspace = align_up(extraspace, 2*wordSize);
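  // For example, with 3 interpreter arguments and the 8-byte Interpreter::stackElementSize
  // used on 64-bit, this is 24 bytes rounded up to 32.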
 964 
 965   // set senderSP value
 966   __ lea(r13, Address(rsp, wordSize));
 967 
 968 #ifdef ASSERT
 969   __ check_stack_alignment(r13, "sender stack not aligned");
 970 #endif
 971   if (extraspace > 0) {
 972     // Pop the return address
 973     __ pop(rax);
 974 
 975     __ subptr(rsp, extraspace);
 976 
 977     // Push the return address
 978     __ push(rax);
 979 
 980     // Account for the return address location since we store it first rather
 981     // than hold it in a register across all the shuffling
 982     extraspace += wordSize;
 983   }
 984 
 985 #ifdef ASSERT
 986   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 987 #endif
 988 
 989   // Now write the args into the outgoing interpreter space
 990 
 991   // next_arg_comp is the next argument from the compiler point of
 992   // view (inline type fields are passed in registers/on the stack). In
 993   // sig_extended, an inline type argument starts with: T_METADATA,
 994   // followed by the types of the fields of the inline type and T_VOID
 995   // to mark the end of the inline type. ignored counts the number of
 996   // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
 997   // used to get the buffer for that argument from the pool of buffers
 998   // we allocated above and want to pass to the
 999   // interpreter. next_arg_int is the next argument from the
1000   // interpreter point of view (inline types are passed by reference).
1001   for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
1002        next_arg_comp < sig_extended->length(); next_arg_comp++) {
1003     assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
1004     assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
1005     BasicType bt = sig_extended->at(next_arg_comp)._bt;
1006     int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
1007     if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
1008       int next_off = st_off - Interpreter::stackElementSize;
1009       const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
1010       const VMRegPair reg_pair = regs[next_arg_comp-ignored];
1011       size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
1012       gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1013                              size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
1014       next_arg_int++;
1015 #ifdef ASSERT
1016       if (bt == T_LONG || bt == T_DOUBLE) {
1017         // Overwrite the unused slot with known junk
1018         __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
1019         __ movptr(Address(rsp, st_off), rax);
1020       }
1021 #endif /* ASSERT */
1022     } else {
1023       ignored++;
1024       // get the buffer from the just allocated pool of buffers
1025       int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
1026       __ load_heap_oop(r14, Address(rscratch2, index));
1027       next_vt_arg++; next_arg_int++;
1028       int vt = 1;
1029       // write fields we get from compiled code in registers/stack
1030       // slots to the buffer: we know we are done with that inline type
1031       // argument when we hit the T_VOID that acts as an end of inline
1032       // type delimiter for this inline type. Inline types are flattened
1033       // so we might encounter embedded inline types. Each entry in
1034       // sig_extended contains a field offset in the buffer.
1035       Label L_null;
1036       do {
1037         next_arg_comp++;
1038         BasicType bt = sig_extended->at(next_arg_comp)._bt;
1039         BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
1040         if (bt == T_METADATA) {
1041           vt++;
1042           ignored++;
1043         } else if (bt == T_VOID &&
1044                    prev_bt != T_LONG &&
1045                    prev_bt != T_DOUBLE) {
1046           vt--;
1047           ignored++;
1048         } else {
1049           int off = sig_extended->at(next_arg_comp)._offset;
1050           if (off == -1) {
1051             // Nullable inline type argument, emit null check
1052             VMReg reg = regs[next_arg_comp-ignored].first();
1053             Label L_notNull;
1054             if (reg->is_stack()) {
1055               int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
1056               __ testb(Address(rsp, ld_off), 1);
1057             } else {
1058               __ testb(reg->as_Register(), 1);
1059             }
1060             __ jcc(Assembler::notZero, L_notNull);
1061             __ movptr(Address(rsp, st_off), 0);
1062             __ jmp(L_null);
1063             __ bind(L_notNull);
1064             continue;
1065           }
1066           assert(off > 0, "offset in object should be positive");
1067           size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
1068           bool is_oop = is_reference_type(bt);
1069           gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1070                                  size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
1071         }
1072       } while (vt != 0);
1073       // pass the buffer to the interpreter
1074       __ movptr(Address(rsp, st_off), r14);
1075       __ bind(L_null);
1076     }
1077   }
1078 
1079   // Schedule the branch target address early.
1080   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
1081   __ jmp(rcx);
1082 }
1083 
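// Branches to L_ok if pc_reg lies strictly inside (code_start, code_end); otherwise
// control falls through (via L_fail) so the caller can test another range or report
// the failure.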
1084 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
1085                         address code_start, address code_end,
1086                         Label& L_ok) {
1087   Label L_fail;
1088   __ lea(temp_reg, AddressLiteral(code_start, relocInfo::none));
1089   __ cmpptr(pc_reg, temp_reg);
1090   __ jcc(Assembler::belowEqual, L_fail);
1091   __ lea(temp_reg, AddressLiteral(code_end, relocInfo::none));
1092   __ cmpptr(pc_reg, temp_reg);
1093   __ jcc(Assembler::below, L_ok);
1094   __ bind(L_fail);
1095 }
1096 
1097 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
1098                                     int comp_args_on_stack,
1099                                     const GrowableArray<SigEntry>* sig,
1100                                     const VMRegPair *regs) {
1101 
1102   // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment we expect in all compiled code, and the register
  // save code can SEGV when fxsave instructions find an improperly
  // aligned stack pointer.
1110 
1111   // Adapters can be frameless because they do not require the caller
1112   // to perform additional cleanup work, such as correcting the stack pointer.
1113   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
1114   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
1115   // even if a callee has modified the stack pointer.
1116   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
1117   // routinely repairs its caller's stack pointer (from sender_sp, which is set
1118   // up via the senderSP register).
1119   // In other words, if *either* the caller or callee is interpreted, we can
1120   // get the stack pointer repaired after a call.
1121   // This is why c2i and i2c adapters cannot be indefinitely composed.
1122   // In particular, if a c2i adapter were to somehow call an i2c adapter,
1123   // both caller and callee would be compiled methods, and neither would
1124   // clean up the stack pointer changes performed by the two adapters.
1125   // If this happens, control eventually transfers back to the compiled
1126   // caller, but with an uncorrected stack, causing delayed havoc.
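  // Schematically: "interpreted -> i2c -> compiled" and "compiled -> c2i -> interpreted"
  // are fine because an interpreted frame sits on one side of the adapter and repairs
  // the stack pointer; "compiled -> c2i -> i2c -> compiled" is the composition that
  // must never occur.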
1127 
1128   if (VerifyAdapterCalls &&
1129       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
1130     // So, let's test for cascading c2i/i2c adapters right now.
1131     //  assert(Interpreter::contains($return_addr) ||
1132     //         StubRoutines::contains($return_addr),
1133     //         "i2c adapter must return to an interpreter frame");
1134     __ block_comment("verify_i2c { ");
1135     // Pick up the return address
1136     __ movptr(rax, Address(rsp, 0));
1137     Label L_ok;
1138     if (Interpreter::code() != nullptr) {
1139       range_check(masm, rax, r11,
1140                   Interpreter::code()->code_start(),
1141                   Interpreter::code()->code_end(),
1142                   L_ok);
1143     }
1144     if (StubRoutines::initial_stubs_code() != nullptr) {
1145       range_check(masm, rax, r11,
1146                   StubRoutines::initial_stubs_code()->code_begin(),
1147                   StubRoutines::initial_stubs_code()->code_end(),
1148                   L_ok);
1149     }
1150     if (StubRoutines::final_stubs_code() != nullptr) {
1151       range_check(masm, rax, r11,
1152                   StubRoutines::final_stubs_code()->code_begin(),
1153                   StubRoutines::final_stubs_code()->code_end(),
1154                   L_ok);
1155     }
1156     const char* msg = "i2c adapter must return to an interpreter frame";
1157     __ block_comment(msg);
1158     __ stop(msg);
1159     __ bind(L_ok);
1160     __ block_comment("} verify_i2ce ");
1161   }
1162 
1163   // Must preserve original SP for loading incoming arguments because
1164   // we need to align the outgoing SP for compiled code.
1165   __ movptr(r11, rsp);
1166 
1167   // Pick up the return address
1168   __ pop(rax);
1169 
1170   // Convert 4-byte c2 stack slots to words.
1171   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
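  // E.g. three outgoing 4-byte slots become align_up(12, 8) >> 3 = 2 words reserved below.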
1172 
1173   if (comp_args_on_stack) {
1174     __ subptr(rsp, comp_words_on_stack * wordSize);
1175   }
1176 
1177   // Ensure compiled code always sees stack at proper alignment
1178   __ andptr(rsp, -16);
1179 
  // Push the return address, misaligning the stack so that the youngest frame
  // always sees it as it would look just after a call instruction.
1182   __ push(rax);
1183 
1184   // Put saved SP in another register
1185   const Register saved_sp = rax;
1186   __ movptr(saved_sp, r11);
1187 
1188   // Will jump to the compiled code just as if compiled code was doing it.
1189   // Pre-load the register-jump target early, to schedule it better.
1190   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1191 
1192 #if INCLUDE_JVMCI
1193   if (EnableJVMCI) {
1194     // check if this call should be routed towards a specific entry point
1195     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1196     Label no_alternative_target;
1197     __ jcc(Assembler::equal, no_alternative_target);
1198     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1199     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1200     __ bind(no_alternative_target);
1201   }
1202 #endif // INCLUDE_JVMCI
1203 
1204   int total_args_passed = sig->length();
1205 
1206   // Now generate the shuffle code.  Pick up all register args and move the
1207   // rest through the floating point stack top.
1208   for (int i = 0; i < total_args_passed; i++) {
1209     BasicType bt = sig->at(i)._bt;
1210     if (bt == T_VOID) {
1211       // Longs and doubles are passed in native word order, but misaligned
1212       // in the 32-bit build.
1213       BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1214       assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1215       continue;
1216     }
1217 
1218     // Pick up 0, 1 or 2 words from SP+offset.
1219 
1220     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1221             "scrambled load targets?");
1222     // Load in argument order going down.
1223     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1224     // Point to interpreter value (vs. tag)
1225     int next_off = ld_off - Interpreter::stackElementSize;
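    // (E.g. with total_args_passed == 2 and i == 0: ld_off == 16 and next_off == 8,
    // given the 8-byte Interpreter::stackElementSize on 64-bit.)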
1229     VMReg r_1 = regs[i].first();
1230     VMReg r_2 = regs[i].second();
1231     if (!r_1->is_valid()) {
1232       assert(!r_2->is_valid(), "");
1233       continue;
1234     }
1235     if (r_1->is_stack()) {
1236       // Convert stack slot to an SP offset (+ wordSize to account for return address )
1237       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1238 
      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going through a c2i because of a miss, a reasonable value of r13
      // will be generated.
1242       if (!r_2->is_valid()) {
1243         // sign extend???
1244         __ movl(r13, Address(saved_sp, ld_off));
1245         __ movptr(Address(rsp, st_off), r13);
1246       } else {
1247         //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed at negative offsets, so the LSW is at the LOW address
1254 
1255         // ld_off is MSW so get LSW
1256         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1257                            next_off : ld_off;
1258         __ movq(r13, Address(saved_sp, offset));
1259         // st_off is LSW (i.e. reg.first())
1260         __ movq(Address(rsp, st_off), r13);
1261       }
1262     } else if (r_1->is_Register()) {  // Register argument
1263       Register r = r_1->as_Register();
1264       assert(r != rax, "must be different");
1265       if (r_2->is_valid()) {
1266         //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
1270 
1271         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1272                            next_off : ld_off;
1273 
1274         // this can be a misaligned move
1275         __ movq(r, Address(saved_sp, offset));
1276       } else {
1277         // 32-bit value; movl zero-extends into the full 64-bit register
1278         __ movl(r, Address(saved_sp, ld_off));
1279       }
1280     } else {
1281       if (!r_2->is_valid()) {
1282         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1283       } else {
1284         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1285       }
1286     }
1287   }
1288 
1289   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1290 
1291   // 6243940 We might end up in handle_wrong_method if
1292   // the callee is deoptimized as we race thru here. If that
1293   // happens we don't want to take a safepoint because the
1294   // caller frame will look interpreted and arguments are now
1295   // "compiled" so it is much better to make this transition
1296   // invisible to the stack walking code. Unfortunately, if
1297   // we try to find the callee by normal means a safepoint
1298   // is possible. So we stash the desired callee in the thread
1299   // and the VM will find it there should this case occur.
1300 
1301   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1302 
1303   // Put the Method* where a c2i adapter would expect it, should we end up there.
1304   // This is only needed because C2 resolve stubs return the Method* as a result
1305   // in rax.
1306   __ mov(rax, rbx);
1307   __ jmp(r11);
1308 }
1309 
1310 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1311   Register data = rax;
1312   __ ic_check(1 /* end_alignment */);
1313   __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1314 
1315   // The method might have been compiled since the call site was patched to
1316   // interpreted; if that is the case, treat it as a miss so we can get
1317   // the call site corrected.
1318   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1319   __ jcc(Assembler::equal, skip_fixup);
1320   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1321 }
1322 
1323 // ---------------------------------------------------------------
1324 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1325                                                             int comp_args_on_stack,
1326                                                             const GrowableArray<SigEntry>* sig,
1327                                                             const VMRegPair* regs,
1328                                                             const GrowableArray<SigEntry>* sig_cc,
1329                                                             const VMRegPair* regs_cc,
1330                                                             const GrowableArray<SigEntry>* sig_cc_ro,
1331                                                             const VMRegPair* regs_cc_ro,
1332                                                             AdapterFingerPrint* fingerprint,
1333                                                             AdapterBlob*& new_adapter,
1334                                                             bool allocate_code_blob) {
1335   address i2c_entry = __ pc();
1336   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1337 
1338   // -------------------------------------------------------------------------
1339   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1340   // to the interpreter.  The args start out packed in the compiled layout.  They
1341   // need to be unpacked into the interpreter layout.  This will almost always
1342   // require some stack space.  We grow the current (compiled) stack, then repack
1343   // the args.  We  finally end in a jump to the generic interpreter entry point.
1344   // On exit from the interpreter, the interpreter will restore our SP (lest the
1345   // compiled code, which relies solely on SP and not RBP, get sick).
1346 
1347   address c2i_unverified_entry        = __ pc();
1348   address c2i_unverified_inline_entry = __ pc();
1349   Label skip_fixup;
1350 
1351   gen_inline_cache_check(masm, skip_fixup);
1352 
1353   OopMapSet* oop_maps = new OopMapSet();
1354   int frame_complete = CodeOffsets::frame_never_safe;
1355   int frame_size_in_words = 0;
1356 
1357   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1358   address c2i_no_clinit_check_entry = nullptr;
1359   address c2i_inline_ro_entry = __ pc();
1360   if (regs_cc != regs_cc_ro) {
1361     // No class init barrier needed because method is guaranteed to be non-static
1362     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, c2i_no_clinit_check_entry,
1363                     skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1364     skip_fixup.reset();
1365   }
1366 
1367   // Scalarized c2i adapter
1368   address c2i_entry        = __ pc();
1369   address c2i_inline_entry = __ pc();
1370   gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1371                   skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);
1372 
1373   // Non-scalarized c2i adapter
1374   if (regs != regs_cc) {
1375     c2i_unverified_inline_entry = __ pc();
1376     Label inline_entry_skip_fixup;
1377     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1378 
1379     c2i_inline_entry = __ pc();
1380     gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1381                     inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1382   }
1383 
1384   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1385   // the GC knows about the location of oop argument locations passed to the c2i adapter.
1386   if (allocate_code_blob) {
1387     bool caller_must_gc_arguments = (regs != regs_cc);
1388     new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1389   }
1390 
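       // Roughly, the entry points registered below are: the i2c entry generated above; the c2i
       // entries built from the scalarized (sig_cc) signature and, when their calling conventions
       // differ, from the sig_cc_ro and plain sig variants; the unverified counterparts that run
       // the inline cache check first; and the optional c2i_no_clinit_check_entry.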
1391   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry, c2i_unverified_inline_entry, c2i_no_clinit_check_entry);
1392 }
1393 
1394 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1395                                          VMRegPair *regs,
1396                                          int total_args_passed) {
1397 
1398 // We return the number of VMRegImpl stack slots we need to reserve for all
1399 // the arguments NOT counting out_preserve_stack_slots.
1400 
1401 // NOTE: These arrays will have to change when c1 is ported
1402 #ifdef _WIN64
1403     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1404       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1405     };
1406     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1407       c_farg0, c_farg1, c_farg2, c_farg3
1408     };
1409 #else
1410     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1411       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1412     };
1413     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1414       c_farg0, c_farg1, c_farg2, c_farg3,
1415       c_farg4, c_farg5, c_farg6, c_farg7
1416     };
1417 #endif // _WIN64
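         // Illustrative example (a sketch of what the loop below computes): for a native
         // signature (jint, jdouble, jlong, jfloat), i.e. sig_bt = {T_INT, T_DOUBLE, T_VOID,
         // T_LONG, T_VOID, T_FLOAT}, the SysV path hands out c_rarg0/c_farg0/c_rarg1/c_farg1
         // and returns 0 stack slots, while the Win64 path allocates positionally
         // (c_rarg0/c_farg1/c_rarg2/c_farg3) and returns 8 slots of register home space.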
1418 
1419 
1420     uint int_args = 0;
1421     uint fp_args = 0;
1422     uint stk_args = 0; // inc by 2 each time
1423 
1424     for (int i = 0; i < total_args_passed; i++) {
1425       switch (sig_bt[i]) {
1426       case T_BOOLEAN:
1427       case T_CHAR:
1428       case T_BYTE:
1429       case T_SHORT:
1430       case T_INT:
1431         if (int_args < Argument::n_int_register_parameters_c) {
1432           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1433 #ifdef _WIN64
1434           fp_args++;
1435           // Allocate slots for the callee to spill register args onto the stack.
1436           stk_args += 2;
1437 #endif
1438         } else {
1439           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1440           stk_args += 2;
1441         }
1442         break;
1443       case T_LONG:
1444         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1445         // fall through
1446       case T_OBJECT:
1447       case T_ARRAY:
1448       case T_ADDRESS:
1449       case T_METADATA:
1450         if (int_args < Argument::n_int_register_parameters_c) {
1451           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1452 #ifdef _WIN64
1453           fp_args++;
1454           stk_args += 2;
1455 #endif
1456         } else {
1457           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1458           stk_args += 2;
1459         }
1460         break;
1461       case T_FLOAT:
1462         if (fp_args < Argument::n_float_register_parameters_c) {
1463           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1464 #ifdef _WIN64
1465           int_args++;
1466           // Allocate slots for the callee to spill register args onto the stack.
1467           stk_args += 2;
1468 #endif
1469         } else {
1470           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1471           stk_args += 2;
1472         }
1473         break;
1474       case T_DOUBLE:
1475         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1476         if (fp_args < Argument::n_float_register_parameters_c) {
1477           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1478 #ifdef _WIN64
1479           int_args++;
1480           // Allocate slots for the callee to spill register args onto the stack.
1481           stk_args += 2;
1482 #endif
1483         } else {
1484           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1485           stk_args += 2;
1486         }
1487         break;
1488       case T_VOID: // Halves of longs and doubles
1489         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1490         regs[i].set_bad();
1491         break;
1492       default:
1493         ShouldNotReachHere();
1494         break;
1495       }
1496     }
1497 #ifdef _WIN64
1498   // The Windows ABI requires that we always allocate enough stack space
1499   // for 4 64-bit register arguments to be stored (the home/shadow space).
1500   if (stk_args < 8) {
1501     stk_args = 8;
1502   }
1503 #endif // _WIN64
1504 
1505   return stk_args;
1506 }
1507 
1508 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1509                                              uint num_bits,
1510                                              uint total_args_passed) {
1511   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1512          "only certain vector sizes are supported for now");
1513 
1514   static const XMMRegister VEC_ArgReg[32] = {
1515      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1516      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1517     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1518     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1519   };
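       // Each vector argument is passed entirely in a register: argument i gets VEC_ArgReg[i],
       // and its pair spans num_bits / 32 slots, e.g. for num_bits == 256 the pair is
       // (xmm<i>->next(7), xmm<i>).  No stack slots are ever used, so stk_args stays 0.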
1520 
1521   uint stk_args = 0;
1522   uint fp_args = 0;
1523 
1524   for (uint i = 0; i < total_args_passed; i++) {
1525     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1526     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1527     regs[i].set_pair(vmreg->next(next_val), vmreg);
1528   }
1529 
1530   return stk_args;
1531 }
1532 
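     // save_native_result/restore_native_result spill and reload the native return value around
     // VM calls made after the native call returns (safepoint polling, unlocking, etc.).  Both
     // use the single word at [rbp - wordSize]; the frame_slots argument is ignored, as noted below.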
1533 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1534   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1535   // which by this time is free to use
1536   switch (ret_type) {
1537   case T_FLOAT:
1538     __ movflt(Address(rbp, -wordSize), xmm0);
1539     break;
1540   case T_DOUBLE:
1541     __ movdbl(Address(rbp, -wordSize), xmm0);
1542     break;
1543   case T_VOID:  break;
1544   default: {
1545     __ movptr(Address(rbp, -wordSize), rax);
1546     }
1547   }
1548 }
1549 
1550 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1551   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1552   // which by this time is free to use
1553   switch (ret_type) {
1554   case T_FLOAT:
1555     __ movflt(xmm0, Address(rbp, -wordSize));
1556     break;
1557   case T_DOUBLE:
1558     __ movdbl(xmm0, Address(rbp, -wordSize));
1559     break;
1560   case T_VOID:  break;
1561   default: {
1562     __ movptr(rax, Address(rbp, -wordSize));
1563     }
1564   }
1565 }
1566 
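     // save_args/restore_args preserve outgoing C argument registers around a VM leaf call
     // (e.g. the DTrace and RedefineClasses probes below).  Integer args take one pushed word,
     // XMM args take two stack words, and restore_args walks the list in reverse, so the two
     // helpers must always be used as a matching LIFO pair.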
1567 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1568     for ( int i = first_arg ; i < arg_count ; i++ ) {
1569       if (args[i].first()->is_Register()) {
1570         __ push(args[i].first()->as_Register());
1571       } else if (args[i].first()->is_XMMRegister()) {
1572         __ subptr(rsp, 2*wordSize);
1573         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1574       }
1575     }
1576 }
1577 
1578 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1579     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1580       if (args[i].first()->is_Register()) {
1581         __ pop(args[i].first()->as_Register());
1582       } else if (args[i].first()->is_XMMRegister()) {
1583         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1584         __ addptr(rsp, 2*wordSize);
1585       }
1586     }
1587 }
1588 
1589 static void verify_oop_args(MacroAssembler* masm,
1590                             const methodHandle& method,
1591                             const BasicType* sig_bt,
1592                             const VMRegPair* regs) {
1593   Register temp_reg = rbx;  // not part of any compiled calling seq
1594   if (VerifyOops) {
1595     for (int i = 0; i < method->size_of_parameters(); i++) {
1596       if (is_reference_type(sig_bt[i])) {
1597         VMReg r = regs[i].first();
1598         assert(r->is_valid(), "bad oop arg");
1599         if (r->is_stack()) {
1600           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1601           __ verify_oop(temp_reg);
1602         } else {
1603           __ verify_oop(r->as_Register());
1604         }
1605       }
1606     }
1607   }
1608 }
1609 
1610 static void check_continuation_enter_argument(VMReg actual_vmreg,
1611                                               Register expected_reg,
1612                                               const char* name) {
1613   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1614   assert(actual_vmreg->as_Register() == expected_reg,
1615          "%s is in unexpected register: %s instead of %s",
1616          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1617 }
1618 
1619 
1620 //---------------------------- continuation_enter_setup ---------------------------
1621 //
1622 // Arguments:
1623 //   None.
1624 //
1625 // Results:
1626 //   rsp: pointer to blank ContinuationEntry
1627 //
1628 // Kills:
1629 //   rax
1630 //
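     // Sketch of the effect (illustrative): on entry, r15_thread->_cont_entry points at the
     // parent entry (or is null); on exit, rsp points at a freshly carved-out, still-blank
     // ContinuationEntry whose parent field holds the old _cont_entry, and
     // r15_thread->_cont_entry == rsp.
     //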
1631 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1632   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1633   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1634   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1635 
1636   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1637   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1638 
1639   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1640   OopMap* map = new OopMap(frame_size, 0);
1641 
1642   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1643   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1644   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1645 
1646   return map;
1647 }
1648 
1649 //---------------------------- fill_continuation_entry ---------------------------
1650 //
1651 // Arguments:
1652 //   rsp: pointer to blank Continuation entry
1653 //   reg_cont_obj: pointer to the continuation
1654 //   reg_flags: flags
1655 //
1656 // Results:
1657 //   rsp: pointer to filled out ContinuationEntry
1658 //
1659 // Kills:
1660 //   rax
1661 //
1662 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1663   assert_different_registers(rax, reg_cont_obj, reg_flags);
1664 #ifdef ASSERT
1665   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1666 #endif
1667   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1668   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1669   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1670   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1671   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1672 
1673   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1674   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1675   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1676   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1677 
1678   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1679   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1680 }
1681 
1682 //---------------------------- continuation_enter_cleanup ---------------------------
1683 //
1684 // Arguments:
1685 //   rsp: pointer to the ContinuationEntry
1686 //
1687 // Results:
1688 //   rsp: pointer to the spilled rbp in the entry frame
1689 //
1690 // Kills:
1691 //   rbx
1692 //
1693 static void continuation_enter_cleanup(MacroAssembler* masm) {
1694 #ifdef ASSERT
1695   Label L_good_sp;
1696   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1697   __ jcc(Assembler::equal, L_good_sp);
1698   __ stop("Incorrect rsp at continuation_enter_cleanup");
1699   __ bind(L_good_sp);
1700 #endif
1701   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1702   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1703 
1704   if (CheckJNICalls) {
1705     // Check if this is a virtual thread continuation
1706     Label L_skip_vthread_code;
1707     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1708     __ jcc(Assembler::equal, L_skip_vthread_code);
1709 
1710     // If the held monitor count is > 0 and this vthread is terminating then
1711     // it failed to release a JNI monitor. So we issue the same log message
1712     // that JavaThread::exit does.
1713     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1714     __ jcc(Assembler::equal, L_skip_vthread_code);
1715 
1716     // rax may hold an exception oop, save it before the call
1717     __ push(rax);
1718     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1719     __ pop(rax);
1720 
1721     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1722     // on termination. The held count is implicitly zeroed below when we restore from
1723     // the parent held count (which has to be zero).
1724     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1725 
1726     __ bind(L_skip_vthread_code);
1727   }
1728 #ifdef ASSERT
1729   else {
1730     // Check if this is a virtual thread continuation
1731     Label L_skip_vthread_code;
1732     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1733     __ jcc(Assembler::equal, L_skip_vthread_code);
1734 
1735     // See comment just above. If not checking JNI calls the JNI count is only
1736     // needed for assertion checking.
1737     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1738 
1739     __ bind(L_skip_vthread_code);
1740   }
1741 #endif
1742 
1743   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1744   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1745 
1746   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1747   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1748   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1749 }
1750 
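     // gen_continuation_enter emits the body of Continuation.enterSpecial.  Roughly, it produces:
     // an interpreted (i2i) entry used only in interp_only_mode, a compiled entry at
     // compiled_entry_offset (the verified entry point), a shared exit that tears the
     // ContinuationEntry back down, and an exception path at exception_offset that performs the
     // same cleanup before forwarding to the proper handler.  frame_complete is recorded only
     // for the compiled entry, as noted inside.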
1751 static void gen_continuation_enter(MacroAssembler* masm,
1752                                    const VMRegPair* regs,
1753                                    int& exception_offset,
1754                                    OopMapSet* oop_maps,
1755                                    int& frame_complete,
1756                                    int& stack_slots,
1757                                    int& interpreted_entry_offset,
1758                                    int& compiled_entry_offset) {
1759 
1760   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1761   int pos_cont_obj   = 0;
1762   int pos_is_cont    = 1;
1763   int pos_is_virtual = 2;
1764 
1765   // The platform-specific calling convention may present the arguments in various registers.
1766   // To simplify the rest of the code, we expect the arguments to reside at these known
1767   // registers, and we additionally check the placement here in case calling convention ever
1768   // changes.
1769   Register reg_cont_obj   = c_rarg1;
1770   Register reg_is_cont    = c_rarg2;
1771   Register reg_is_virtual = c_rarg3;
1772 
1773   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1774   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1775   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1776 
1777   // Utility methods kill rax, make sure there are no collisions
1778   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1779 
1780   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1781                          relocInfo::static_call_type);
1782 
1783   address start = __ pc();
1784 
1785   Label L_thaw, L_exit;
1786 
1787   // i2i entry used at interp_only_mode only
1788   interpreted_entry_offset = __ pc() - start;
1789   {
1790 #ifdef ASSERT
1791     Label is_interp_only;
1792     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1793     __ jcc(Assembler::notEqual, is_interp_only);
1794     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1795     __ bind(is_interp_only);
1796 #endif
1797 
1798     __ pop(rax); // return address
1799     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1800     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1801     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1802     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1803     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1804     __ push(rax); // return address
1805     __ push_cont_fastpath();
1806 
1807     __ enter();
1808 
1809     stack_slots = 2; // will be adjusted in setup
1810     OopMap* map = continuation_enter_setup(masm, stack_slots);
1811     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1812     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1813 
1814     __ verify_oop(reg_cont_obj);
1815 
1816     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1817 
1818     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1819     __ testptr(reg_is_cont, reg_is_cont);
1820     __ jcc(Assembler::notZero, L_thaw);
1821 
1822     // --- Resolve path
1823 
1824     // Make sure the call is patchable
1825     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1826     // Emit stub for static call
1827     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1828     if (stub == nullptr) {
1829       fatal("CodeCache is full at gen_continuation_enter");
1830     }
1831     __ call(resolve);
1832     oop_maps->add_gc_map(__ pc() - start, map);
1833     __ post_call_nop();
1834 
1835     __ jmp(L_exit);
1836   }
1837 
1838   // compiled entry
1839   __ align(CodeEntryAlignment);
1840   compiled_entry_offset = __ pc() - start;
1841   __ enter();
1842 
1843   stack_slots = 2; // will be adjusted in setup
1844   OopMap* map = continuation_enter_setup(masm, stack_slots);
1845 
1846   // Frame is now completed as far as size and linkage.
1847   frame_complete = __ pc() - start;
1848 
1849   __ verify_oop(reg_cont_obj);
1850 
1851   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1852 
1853   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1854   __ testptr(reg_is_cont, reg_is_cont);
1855   __ jccb(Assembler::notZero, L_thaw);
1856 
1857   // --- call Continuation.enter(Continuation c, boolean isContinue)
1858 
1859   // Make sure the call is patchable
1860   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1861 
1862   // Emit stub for static call
1863   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1864   if (stub == nullptr) {
1865     fatal("CodeCache is full at gen_continuation_enter");
1866   }
1867 
1868   // The call needs to be resolved. There's a special case for this in
1869   // SharedRuntime::find_callee_info_helper() which calls
1870   // LinkResolver::resolve_continuation_enter() which resolves the call to
1871   // Continuation.enter(Continuation c, boolean isContinue).
1872   __ call(resolve);
1873 
1874   oop_maps->add_gc_map(__ pc() - start, map);
1875   __ post_call_nop();
1876 
1877   __ jmpb(L_exit);
1878 
1879   // --- Thawing path
1880 
1881   __ bind(L_thaw);
1882 
1883   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1884 
1885   ContinuationEntry::_return_pc_offset = __ pc() - start;
1886   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1887   __ post_call_nop();
1888 
1889   // --- Normal exit (resolve/thawing)
1890 
1891   __ bind(L_exit);
1892 
1893   continuation_enter_cleanup(masm);
1894   __ pop(rbp);
1895   __ ret(0);
1896 
1897   // --- Exception handling path
1898 
1899   exception_offset = __ pc() - start;
1900 
1901   continuation_enter_cleanup(masm);
1902   __ pop(rbp);
1903 
1904   __ movptr(c_rarg0, r15_thread);
1905   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1906 
1907   // rax still holds the original exception oop, save it before the call
1908   __ push(rax);
1909 
1910   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1911   __ movptr(rbx, rax);
1912 
1913   // Continue at exception handler:
1914   //   rax: exception oop
1915   //   rbx: exception handler
1916   //   rdx: exception pc
1917   __ pop(rax);
1918   __ verify_oop(rax);
1919   __ pop(rdx);
1920   __ jmp(rbx);
1921 }
1922 
1923 static void gen_continuation_yield(MacroAssembler* masm,
1924                                    const VMRegPair* regs,
1925                                    OopMapSet* oop_maps,
1926                                    int& frame_complete,
1927                                    int& stack_slots,
1928                                    int& compiled_entry_offset) {
1929   enum layout {
1930     rbp_off,
1931     rbpH_off,
1932     return_off,
1933     return_off2,
1934     framesize // inclusive of return address
1935   };
1936   stack_slots = framesize /  VMRegImpl::slots_per_word;
1937   assert(stack_slots == 2, "recheck layout");
1938 
1939   address start = __ pc();
1940   compiled_entry_offset = __ pc() - start;
1941   __ enter();
1942   address the_pc = __ pc();
1943 
1944   frame_complete = the_pc - start;
1945 
1946   // This nop must be exactly at the PC we push into the frame info.
1947   // We use this nop for fast CodeBlob lookup, associate the OopMap
1948   // with it right away.
1949   __ post_call_nop();
1950   OopMap* map = new OopMap(framesize, 1);
1951   oop_maps->add_gc_map(frame_complete, map);
1952 
1953   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1954   __ movptr(c_rarg0, r15_thread);
1955   __ movptr(c_rarg1, rsp);
1956   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1957   __ reset_last_Java_frame(true);
1958 
1959   Label L_pinned;
1960 
1961   __ testptr(rax, rax);
1962   __ jcc(Assembler::notZero, L_pinned);
1963 
1964   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1965   continuation_enter_cleanup(masm);
1966   __ pop(rbp);
1967   __ ret(0);
1968 
1969   __ bind(L_pinned);
1970 
1971   // Pinned, return to caller
1972 
1973   // handle pending exception thrown by freeze
1974   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1975   Label ok;
1976   __ jcc(Assembler::equal, ok);
1977   __ leave();
1978   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1979   __ bind(ok);
1980 
1981   __ leave();
1982   __ ret(0);
1983 }
1984 
1985 static void gen_special_dispatch(MacroAssembler* masm,
1986                                  const methodHandle& method,
1987                                  const BasicType* sig_bt,
1988                                  const VMRegPair* regs) {
1989   verify_oop_args(masm, method, sig_bt, regs);
1990   vmIntrinsics::ID iid = method->intrinsic_id();
1991 
1992   // Now write the args into the outgoing interpreter space
1993   bool     has_receiver   = false;
1994   Register receiver_reg   = noreg;
1995   int      member_arg_pos = -1;
1996   Register member_reg     = noreg;
1997   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1998   if (ref_kind != 0) {
1999     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
2000     member_reg = rbx;  // known to be free at this point
2001     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
2002   } else if (iid == vmIntrinsics::_invokeBasic) {
2003     has_receiver = true;
2004   } else if (iid == vmIntrinsics::_linkToNative) {
2005     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
2006     member_reg = rbx;  // known to be free at this point
2007   } else {
2008     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
2009   }
2010 
2011   if (member_reg != noreg) {
2012     // Load the member_arg into register, if necessary.
2013     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
2014     VMReg r = regs[member_arg_pos].first();
2015     if (r->is_stack()) {
2016       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
2017     } else {
2018       // no data motion is needed
2019       member_reg = r->as_Register();
2020     }
2021   }
2022 
2023   if (has_receiver) {
2024     // Make sure the receiver is loaded into a register.
2025     assert(method->size_of_parameters() > 0, "oob");
2026     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
2027     VMReg r = regs[0].first();
2028     assert(r->is_valid(), "bad receiver arg");
2029     if (r->is_stack()) {
2030       // Porting note:  This assumes that compiled calling conventions always
2031       // pass the receiver oop in a register.  If this is not true on some
2032       // platform, pick a temp and load the receiver from stack.
2033       fatal("receiver always in a register");
2034       receiver_reg = j_rarg0;  // known to be free at this point
2035       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
2036     } else {
2037       // no data motion is needed
2038       receiver_reg = r->as_Register();
2039     }
2040   }
2041 
2042   // Figure out which address we are really jumping to:
2043   MethodHandles::generate_method_handle_dispatch(masm, iid,
2044                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
2045 }
2046 
2047 // ---------------------------------------------------------------------------
2048 // Generate a native wrapper for a given method.  The method takes arguments
2049 // in the Java compiled code convention, marshals them to the native
2050 // convention (handlizes oops, etc), transitions to native, makes the call,
2051 // returns to java state (possibly blocking), unhandlizes any result and
2052 // returns.
2053 //
2054 // Critical native functions are a shorthand for the use of
2055 // GetPrimitiveArrayCritical and disallow the use of any other JNI
2056 // functions.  The wrapper is expected to unpack the arguments before
2057 // passing them to the callee. Critical native functions leave the state _in_Java,
2058 // since they cannot stop for GC.
2059 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
2060 // block and the check for pending exceptions, since it's impossible for them
2061 // to be thrown.
2062 //
2063 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
2064                                                 const methodHandle& method,
2065                                                 int compile_id,
2066                                                 BasicType* in_sig_bt,
2067                                                 VMRegPair* in_regs,
2068                                                 BasicType ret_type) {
2069   if (method->is_continuation_native_intrinsic()) {
2070     int exception_offset = -1;
2071     OopMapSet* oop_maps = new OopMapSet();
2072     int frame_complete = -1;
2073     int stack_slots = -1;
2074     int interpreted_entry_offset = -1;
2075     int vep_offset = -1;
2076     if (method->is_continuation_enter_intrinsic()) {
2077       gen_continuation_enter(masm,
2078                              in_regs,
2079                              exception_offset,
2080                              oop_maps,
2081                              frame_complete,
2082                              stack_slots,
2083                              interpreted_entry_offset,
2084                              vep_offset);
2085     } else if (method->is_continuation_yield_intrinsic()) {
2086       gen_continuation_yield(masm,
2087                              in_regs,
2088                              oop_maps,
2089                              frame_complete,
2090                              stack_slots,
2091                              vep_offset);
2092     } else {
2093       guarantee(false, "Unknown Continuation native intrinsic");
2094     }
2095 
2096 #ifdef ASSERT
2097     if (method->is_continuation_enter_intrinsic()) {
2098       assert(interpreted_entry_offset != -1, "Must be set");
2099       assert(exception_offset != -1,         "Must be set");
2100     } else {
2101       assert(interpreted_entry_offset == -1, "Must be unset");
2102       assert(exception_offset == -1,         "Must be unset");
2103     }
2104     assert(frame_complete != -1,    "Must be set");
2105     assert(stack_slots != -1,       "Must be set");
2106     assert(vep_offset != -1,        "Must be set");
2107 #endif
2108 
2109     __ flush();
2110     nmethod* nm = nmethod::new_native_nmethod(method,
2111                                               compile_id,
2112                                               masm->code(),
2113                                               vep_offset,
2114                                               frame_complete,
2115                                               stack_slots,
2116                                               in_ByteSize(-1),
2117                                               in_ByteSize(-1),
2118                                               oop_maps,
2119                                               exception_offset);
2120     if (nm == nullptr) return nm;
2121     if (method->is_continuation_enter_intrinsic()) {
2122       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2123     } else if (method->is_continuation_yield_intrinsic()) {
2124       _cont_doYield_stub = nm;
2125     }
2126     return nm;
2127   }
2128 
2129   if (method->is_method_handle_intrinsic()) {
2130     vmIntrinsics::ID iid = method->intrinsic_id();
2131     intptr_t start = (intptr_t)__ pc();
2132     int vep_offset = ((intptr_t)__ pc()) - start;
2133     gen_special_dispatch(masm,
2134                          method,
2135                          in_sig_bt,
2136                          in_regs);
2137     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2138     __ flush();
2139     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2140     return nmethod::new_native_nmethod(method,
2141                                        compile_id,
2142                                        masm->code(),
2143                                        vep_offset,
2144                                        frame_complete,
2145                                        stack_slots / VMRegImpl::slots_per_word,
2146                                        in_ByteSize(-1),
2147                                        in_ByteSize(-1),
2148                                        nullptr);
2149   }
2150   address native_func = method->native_function();
2151   assert(native_func != nullptr, "must have function");
2152 
2153   // An OopMap for lock (and class if static)
2154   OopMapSet *oop_maps = new OopMapSet();
2155   intptr_t start = (intptr_t)__ pc();
2156 
2157   // We have received a description of where all the java args are located
2158   // on entry to the wrapper. We need to convert these args to where
2159   // the jni function will expect them. To figure out where they go
2160   // we convert the java signature to a C signature by inserting
2161   // the hidden arguments as arg[0] and possibly arg[1] (static method)
2162 
2163   const int total_in_args = method->size_of_parameters();
2164   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2165 
2166   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2167   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2168   BasicType* in_elem_bt = nullptr;
2169 
2170   int argc = 0;
2171   out_sig_bt[argc++] = T_ADDRESS;
2172   if (method->is_static()) {
2173     out_sig_bt[argc++] = T_OBJECT;
2174   }
2175 
2176   for (int i = 0; i < total_in_args ; i++ ) {
2177     out_sig_bt[argc++] = in_sig_bt[i];
2178   }
2179 
2180   // Now figure out where the args must be stored and how much stack space
2181   // they require.
2182   int out_arg_slots;
2183   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2184 
2185   // Compute framesize for the wrapper.  We need to handlize all oops in
2186   // incoming registers
2187 
2188   // Calculate the total number of stack slots we will need.
2189 
2190   // First count the abi requirement plus all of the outgoing args
2191   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2192 
2193   // Now the space for the inbound oop handle area
2194   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
2195 
2196   int oop_handle_offset = stack_slots;
2197   stack_slots += total_save_slots;
2198 
2199   // Now any space we need for handlizing a klass if static method
2200 
2201   int klass_slot_offset = 0;
2202   int klass_offset = -1;
2203   int lock_slot_offset = 0;
2204   bool is_static = false;
2205 
2206   if (method->is_static()) {
2207     klass_slot_offset = stack_slots;
2208     stack_slots += VMRegImpl::slots_per_word;
2209     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2210     is_static = true;
2211   }
2212 
2213   // Plus a lock if needed
2214 
2215   if (method->is_synchronized()) {
2216     lock_slot_offset = stack_slots;
2217     stack_slots += VMRegImpl::slots_per_word;
2218   }
2219 
2220   // Now a place (+2) to save return values or temp during shuffling
2221   // + 4 for return address (which we own) and saved rbp
2222   stack_slots += 6;
2223 
2224   // Ok The space we have allocated will look like:
2225   //
2226   //
2227   // FP-> |                     |
2228   //      |---------------------|
2229   //      | 2 slots for moves   |
2230   //      |---------------------|
2231   //      | lock box (if sync)  |
2232   //      |---------------------| <- lock_slot_offset
2233   //      | klass (if static)   |
2234   //      |---------------------| <- klass_slot_offset
2235   //      | oopHandle area      |
2236   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2237   //      | outbound memory     |
2238   //      | based arguments     |
2239   //      |                     |
2240   //      |---------------------|
2241   //      |                     |
2242   // SP-> | out_preserved_slots |
2243   //
2244   //
2245 
2246 
2247   // Now compute the actual number of stack words we need, rounding to keep the
2248   // stack properly aligned.
2249   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2250 
2251   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2252 
2253   // First thing make an ic check to see if we should even be here
2254 
2255   // We are free to use all registers as temps without saving them and
2256   // restoring them except rbp. rbp is the only callee save register
2257   // as far as the interpreter and the compiler(s) are concerned.
2258 
2259   const Register receiver = j_rarg0;
2260 
2261   Label exception_pending;
2262 
2263   assert_different_registers(receiver, rscratch1, rscratch2);
2264   __ verify_oop(receiver);
2265   __ ic_check(8 /* end_alignment */);
2266 
2267   int vep_offset = ((intptr_t)__ pc()) - start;
2268 
2269   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2270     Label L_skip_barrier;
2271     Register klass = r10;
2272     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2273     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2274 
2275     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2276 
2277     __ bind(L_skip_barrier);
2278   }
2279 
2280 #ifdef COMPILER1
2281   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2282   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2283     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2284   }
2285 #endif // COMPILER1
2286 
2287   // The instruction at the verified entry point must be 5 bytes or longer
2288   // because it can be patched on the fly by make_non_entrant. The stack bang
2289   // instruction fits that requirement.
2290 
2291   // Generate stack overflow check
2292   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2293 
2294   // Generate a new frame for the wrapper.
2295   __ enter();
2296   // -2 because return address is already present and so is saved rbp
2297   __ subptr(rsp, stack_size - 2*wordSize);
2298 
2299   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2300   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2301   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2302 
2303   // Frame is now completed as far as size and linkage.
2304   int frame_complete = ((intptr_t)__ pc()) - start;
2305 
2306 #ifdef ASSERT
2307   __ check_stack_alignment(rsp, "improperly aligned stack");
2308 #endif /* ASSERT */
2309 
2310 
2311   // We use r14 as the oop handle for the receiver/klass
2312   // It is callee save so it survives the call to native
2313 
2314   const Register oop_handle_reg = r14;
2315 
2316   //
2317   // We immediately shuffle the arguments so that for any VM call we have to
2318   // make from here on out (sync slow path, jvmti, etc.) we will have
2319   // captured the oops from our caller and have a valid oopMap for
2320   // them.
2321 
2322   // -----------------
2323   // The Grand Shuffle
2324 
2325   // The Java calling convention is either equal (linux) or denser (win64) than the
2326   // C calling convention. However, because of the jni_env argument the C calling
2327   // convention always has at least one more (and two for static) arguments than Java.
2328   // Therefore if we move the args from java -> c backwards then we will never have
2329   // a register->register conflict and we don't have to build a dependency graph
2330   // and figure out how to break any cycles.
2331   //
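       // Illustrative example: for a static native method taking (jint, jobject), the outgoing C
       // signature built above is (JNIEnv*, jclass, jint, jobject), so every Java arg moves to a
       // C position two slots later (one slot later for non-static methods).  Walking the args
       // from last to first is what lets the loop below do these moves without clobbering a
       // source that has not been copied yet.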
2332 
2333   // Record esp-based slot for receiver on stack for non-static methods
2334   int receiver_offset = -1;
2335 
2336   // This is a trick. We double the stack slots so we can claim
2337   // the oops in the caller's frame. Since we are sure to have
2338   // more args than the caller, doubling is enough to make
2339   // sure we can capture all the incoming oop args from the
2340   // caller.
2341   //
2342   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2343 
2344   // Mark location of rbp (someday)
2345   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2346 
2347   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2348   // All inbound args are referenced based on rbp and all outbound args via rsp.
2349 
2350 
2351 #ifdef ASSERT
2352   bool reg_destroyed[Register::number_of_registers];
2353   bool freg_destroyed[XMMRegister::number_of_registers];
2354   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2355     reg_destroyed[r] = false;
2356   }
2357   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2358     freg_destroyed[f] = false;
2359   }
2360 
2361 #endif /* ASSERT */
2362 
2363   // For JNI natives the incoming and outgoing registers are offset upwards.
2364   GrowableArray<int> arg_order(2 * total_in_args);
2365 
2366   VMRegPair tmp_vmreg;
2367   tmp_vmreg.set2(rbx->as_VMReg());
2368 
2369   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2370     arg_order.push(i);
2371     arg_order.push(c_arg);
2372   }
2373 
2374   int temploc = -1;
2375   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2376     int i = arg_order.at(ai);
2377     int c_arg = arg_order.at(ai + 1);
2378     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2379 #ifdef ASSERT
2380     if (in_regs[i].first()->is_Register()) {
2381       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2382     } else if (in_regs[i].first()->is_XMMRegister()) {
2383       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2384     }
2385     if (out_regs[c_arg].first()->is_Register()) {
2386       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2387     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2388       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2389     }
2390 #endif /* ASSERT */
2391     switch (in_sig_bt[i]) {
2392       case T_ARRAY:
2393       case T_OBJECT:
2394         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2395                     ((i == 0) && (!is_static)),
2396                     &receiver_offset);
2397         break;
2398       case T_VOID:
2399         break;
2400 
2401       case T_FLOAT:
2402         __ float_move(in_regs[i], out_regs[c_arg]);
2403         break;
2404 
2405       case T_DOUBLE:
2406         assert( i + 1 < total_in_args &&
2407                 in_sig_bt[i + 1] == T_VOID &&
2408                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2409         __ double_move(in_regs[i], out_regs[c_arg]);
2410         break;
2411 
2412       case T_LONG :
2413         __ long_move(in_regs[i], out_regs[c_arg]);
2414         break;
2415 
2416       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2417 
2418       default:
2419         __ move32_64(in_regs[i], out_regs[c_arg]);
2420     }
2421   }
2422 
2423   int c_arg;
2424 
2425   // Pre-load a static method's oop into r14.  Used both by locking code and
2426   // the normal JNI call code.
2427   // point c_arg at the first arg that is already loaded in case we
2428   // need to spill before we call out
2429   c_arg = total_c_args - total_in_args;
2430 
2431   if (method->is_static()) {
2432 
2433     //  load oop into a register
2434     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2435 
2436     // Now handlize the static class mirror; it's known to be not-null.
2437     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2438     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2439 
2440     // Now get the handle
2441     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2442     // store the klass handle as second argument
2443     __ movptr(c_rarg1, oop_handle_reg);
2444     // and protect the arg if we must spill
2445     c_arg--;
2446   }
2447 
2448   // Change state to native (we save the return address in the thread, since it might not
2449   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2450   // points into the right code segment. It does not have to be the correct return pc.
2451   // We use the same pc/oopMap repeatedly when we call out
2452 
2453   intptr_t the_pc = (intptr_t) __ pc();
2454   oop_maps->add_gc_map(the_pc - start, map);
2455 
2456   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2457 
2458 
2459   // We have all of the arguments set up at this point.  We must not touch any of the
2460   // argument registers from here on (if we saved/restored them there would be no oopMap for them).
2461 
2462   if (DTraceMethodProbes) {
2463     // protect the args we've loaded
2464     save_args(masm, total_c_args, c_arg, out_regs);
2465     __ mov_metadata(c_rarg1, method());
2466     __ call_VM_leaf(
2467       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2468       r15_thread, c_rarg1);
2469     restore_args(masm, total_c_args, c_arg, out_regs);
2470   }
2471 
2472   // RedefineClasses() tracing support for obsolete method entry
2473   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2474     // protect the args we've loaded
2475     save_args(masm, total_c_args, c_arg, out_regs);
2476     __ mov_metadata(c_rarg1, method());
2477     __ call_VM_leaf(
2478       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2479       r15_thread, c_rarg1);
2480     restore_args(masm, total_c_args, c_arg, out_regs);
2481   }
2482 
2483   // Lock a synchronized method
2484 
2485   // Register definitions used by locking and unlocking
2486 
2487   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2488   const Register obj_reg  = rbx;  // Will contain the oop
2489   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2490   const Register old_hdr  = r13;  // value of old header at unlock time
2491 
2492   Label slow_path_lock;
2493   Label lock_done;
2494 
2495   if (method->is_synchronized()) {
2496     Label count_mon;
2497 
2498     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2499 
2500     // Get the handle (the 2nd argument)
2501     __ mov(oop_handle_reg, c_rarg1);
2502 
2503     // Get address of the box
2504 
2505     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2506 
2507     // Load the oop from the handle
2508     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2509 
2510     if (LockingMode == LM_MONITOR) {
2511       __ jmp(slow_path_lock);
2512     } else if (LockingMode == LM_LEGACY) {
2513       // Load immediate 1 into swap_reg %rax
2514       __ movl(swap_reg, 1);
2515 
2516       // Load (object->mark() | 1) into swap_reg %rax
2517       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2518       if (EnableValhalla) {
2519         // Mask inline_type bit such that we go to the slow path if object is an inline type
2520         __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place));
2521       }
2522 
2523       // Save (object->mark() | 1) into BasicLock's displaced header
2524       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2525 
2526       // src -> dest iff dest == rax else rax <- dest
2527       __ lock();
2528       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2529       __ jcc(Assembler::equal, count_mon);
2530 
2531       // Hmm should this move to the slow path code area???
2532 
2533       // Test if the oopMark is an obvious stack pointer, i.e.,
2534       //  1) (mark & 3) == 0, and
2535       //  2) rsp <= mark < mark + os::pagesize()
2536       // These 3 tests can be done by evaluating the following
2537       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2538       // assuming both stack pointer and pagesize have their
2539       // least significant 2 bits clear.
2540       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
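           // Worked example of that expression (assuming a 4K page): 3 - 4096 is 0x...F003, so
           // the AND keeps bits 0-1 and bits 12 and up while clearing bits 2-11.  The result is
           // zero exactly when mark >= rsp, mark - rsp < page size, and the low two bits are
           // clear -- i.e. the displaced mark is a stack lock in our own frame, the recursive
           // case whose zero "result" is stored below.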
2541 
2542       __ subptr(swap_reg, rsp);
2543       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2544 
2545       // Save the test result, for recursive case, the result is zero
2546       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2547       __ jcc(Assembler::notEqual, slow_path_lock);
2548     } else {
2549       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2550       __ lightweight_lock(lock_reg, obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2551     }
2552     __ bind(count_mon);
2553     __ inc_held_monitor_count();
2554 
2555     // Slow path will re-enter here
2556     __ bind(lock_done);
2557   }
2558 
2559   // Finally just about ready to make the JNI call
2560 
2561   // get JNIEnv* which is first argument to native
2562   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2563 
2564   // Now set thread in native
2565   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2566 
2567   __ call(RuntimeAddress(native_func));
2568 
2569   // Verify or restore cpu control state after JNI call
2570   __ restore_cpu_control_state_after_jni(rscratch1);
2571 
2572   // Unpack native results.
2573   switch (ret_type) {
2574   case T_BOOLEAN: __ c2bool(rax);            break;
2575   case T_CHAR   : __ movzwl(rax, rax);      break;
2576   case T_BYTE   : __ sign_extend_byte (rax); break;
2577   case T_SHORT  : __ sign_extend_short(rax); break;
2578   case T_INT    : /* nothing to do */        break;
2579   case T_DOUBLE :
2580   case T_FLOAT  :
2581     // Result is in xmm0 we'll save as needed
2582     break;
2583   case T_ARRAY:                 // Really a handle
2584   case T_OBJECT:                // Really a handle
2585       break; // can't de-handlize until after safepoint check
2586   case T_VOID: break;
2587   case T_LONG: break;
2588   default       : ShouldNotReachHere();
2589   }
2590 
2591   Label after_transition;
2592 
2593   // Switch thread to "native transition" state before reading the synchronization state.
2594   // This additional state is necessary because reading and testing the synchronization
2595   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2596   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2597   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2598   //     Thread A is resumed to finish this native method, but doesn't block here since it
2599   //     didn't see any synchronization in progress, and escapes.
2600   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2601 
2602   // Force this write out before the read below
2603   if (!UseSystemMemoryBarrier) {
2604     __ membar(Assembler::Membar_mask_bits(
2605               Assembler::LoadLoad | Assembler::LoadStore |
2606               Assembler::StoreLoad | Assembler::StoreStore));
2607   }
2608 
2609   // check for safepoint operation in progress and/or pending suspend requests
2610   {
2611     Label Continue;
2612     Label slow_path;
2613 
2614     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2615 
2616     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2617     __ jcc(Assembler::equal, Continue);
2618     __ bind(slow_path);
2619 
2620     // Don't use call_VM as it will see a possible pending exception and forward it
2621     // and never return here preventing us from clearing _last_native_pc down below.
2622     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2623     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2624     // by hand.
2625     //
2626     __ vzeroupper();
2627     save_native_result(masm, ret_type, stack_slots);
2628     __ mov(c_rarg0, r15_thread);
2629     __ mov(r12, rsp); // remember sp
2630     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2631     __ andptr(rsp, -16); // align stack as required by ABI
2632     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2633     __ mov(rsp, r12); // restore sp
2634     __ reinit_heapbase();
2635     // Restore any method result value
2636     restore_native_result(masm, ret_type, stack_slots);
2637     __ bind(Continue);
2638   }
2639 
2640   // change thread state
2641   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2642   __ bind(after_transition);
2643 
2644   Label reguard;
2645   Label reguard_done;
2646   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2647   __ jcc(Assembler::equal, reguard);
2648   __ bind(reguard_done);
2649 
  // The native result, if any, is live at this point
2651 
2652   // Unlock
2653   Label slow_path_unlock;
2654   Label unlock_done;
2655   if (method->is_synchronized()) {
2656 
2657     Label fast_done;
2658 
2659     // Get locked oop from the handle we passed to jni
2660     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2661 
2662     if (LockingMode == LM_LEGACY) {
2663       Label not_recur;
2664       // Simple recursive lock?
2665       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2666       __ jcc(Assembler::notEqual, not_recur);
2667       __ dec_held_monitor_count();
2668       __ jmpb(fast_done);
2669       __ bind(not_recur);
2670     }
2671 
2672     // Must save rax if it is live now because cmpxchg must use it
2673     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2674       save_native_result(masm, ret_type, stack_slots);
2675     }
2676 
2677     if (LockingMode == LM_MONITOR) {
2678       __ jmp(slow_path_unlock);
2679     } else if (LockingMode == LM_LEGACY) {
2680       // get address of the stack lock
2681       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2682       //  get old displaced header
2683       __ movptr(old_hdr, Address(rax, 0));
2684 
2685       // Atomic swap old header if oop still contains the stack lock
2686       __ lock();
2687       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2688       __ jcc(Assembler::notEqual, slow_path_unlock);
2689       __ dec_held_monitor_count();
2690     } else {
2691       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2692       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2693       __ dec_held_monitor_count();
2694     }
2695 
2696     // slow path re-enters here
2697     __ bind(unlock_done);
2698     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2699       restore_native_result(masm, ret_type, stack_slots);
2700     }
2701 
2702     __ bind(fast_done);
2703   }
2704   if (DTraceMethodProbes) {
2705     save_native_result(masm, ret_type, stack_slots);
2706     __ mov_metadata(c_rarg1, method());
2707     __ call_VM_leaf(
2708          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2709          r15_thread, c_rarg1);
2710     restore_native_result(masm, ret_type, stack_slots);
2711   }
2712 
2713   __ reset_last_Java_frame(false);
2714 
2715   // Unbox oop result, e.g. JNIHandles::resolve value.
2716   if (is_reference_type(ret_type)) {
2717     __ resolve_jobject(rax /* value */,
2718                        r15_thread /* thread */,
2719                        rcx /* tmp */);
2720   }
2721 
2722   if (CheckJNICalls) {
2723     // clear_pending_jni_exception_check
2724     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2725   }
2726 
2727   // reset handle block
2728   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2729   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2730 
2731   // pop our frame
2732 
2733   __ leave();
2734 
2735   // Any exception pending?
2736   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2737   __ jcc(Assembler::notEqual, exception_pending);
2738 
2739   // Return
2740 
2741   __ ret(0);
2742 
2743   // Unexpected paths are out of line and go here
2744 
  __ bind(exception_pending);

  // forward the exception
2749   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2750 
2751   // Slow path locking & unlocking
2752   if (method->is_synchronized()) {
2753 
2754     // BEGIN Slow path lock
2755     __ bind(slow_path_lock);
2756 
2757     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2758     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2759 
2760     // protect the args we've loaded
2761     save_args(masm, total_c_args, c_arg, out_regs);
2762 
2763     __ mov(c_rarg0, obj_reg);
2764     __ mov(c_rarg1, lock_reg);
2765     __ mov(c_rarg2, r15_thread);
2766 
2767     // Not a leaf but we have last_Java_frame setup as we want
2768     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2769     restore_args(masm, total_c_args, c_arg, out_regs);
2770 
2771 #ifdef ASSERT
2772     { Label L;
2773     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2774     __ jcc(Assembler::equal, L);
2775     __ stop("no pending exception allowed on exit from monitorenter");
2776     __ bind(L);
2777     }
2778 #endif
2779     __ jmp(lock_done);
2780 
2781     // END Slow path lock
2782 
2783     // BEGIN Slow path unlock
2784     __ bind(slow_path_unlock);
2785 
    // If we haven't already saved the native result, we must save it now, as
    // the xmm registers are still exposed.
2788     __ vzeroupper();
2789     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2790       save_native_result(masm, ret_type, stack_slots);
2791     }
2792 
2793     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2794 
2795     __ mov(c_rarg0, obj_reg);
2796     __ mov(c_rarg2, r15_thread);
2797     __ mov(r12, rsp); // remember sp
2798     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2799     __ andptr(rsp, -16); // align stack as required by ABI
2800 
2801     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2802     // NOTE that obj_reg == rbx currently
2803     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2804     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2805 
2806     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2807     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2808     __ mov(rsp, r12); // restore sp
2809     __ reinit_heapbase();
2810 #ifdef ASSERT
2811     {
2812       Label L;
2813       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2814       __ jcc(Assembler::equal, L);
2815       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2816       __ bind(L);
2817     }
2818 #endif /* ASSERT */
2819 
2820     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2821 
2822     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2823       restore_native_result(masm, ret_type, stack_slots);
2824     }
2825     __ jmp(unlock_done);
2826 
2827     // END Slow path unlock
2828 
2829   } // synchronized
2830 
2831   // SLOW PATH Reguard the stack if needed
2832 
2833   __ bind(reguard);
2834   __ vzeroupper();
2835   save_native_result(masm, ret_type, stack_slots);
2836   __ mov(r12, rsp); // remember sp
2837   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2838   __ andptr(rsp, -16); // align stack as required by ABI
2839   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2840   __ mov(rsp, r12); // restore sp
2841   __ reinit_heapbase();
2842   restore_native_result(masm, ret_type, stack_slots);
2843   // and continue
2844   __ jmp(reguard_done);
2845 
2846 
2847 
2848   __ flush();
2849 
2850   nmethod *nm = nmethod::new_native_nmethod(method,
2851                                             compile_id,
2852                                             masm->code(),
2853                                             vep_offset,
2854                                             frame_complete,
2855                                             stack_slots / VMRegImpl::slots_per_word,
2856                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2857                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2858                                             oop_maps);
2859 
2860   return nm;
2861 }
2862 
// This function returns the adjustment size (in number of words) to a c2i
// adapter activation for use during deoptimization.
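// For example (illustration only): a callee with 3 parameters and 5 locals
// needs (5 - 3) * Interpreter::stackElementWords extra words in its
// interpreter activation relative to the compiled activation.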
2865 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2866   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2867 }
2868 
2869 
2870 uint SharedRuntime::out_preserve_stack_slots() {
2871   return 0;
2872 }
2873 
2874 
2875 // Number of stack slots between incoming argument block and the start of
2876 // a new frame.  The PROLOG must add this many slots to the stack.  The
// EPILOG must remove this many slots.  amd64 needs four slots: two for the
// return address and two for the saved rbp.
2879 uint SharedRuntime::in_preserve_stack_slots() {
2880   return 4 + 2 * VerifyStackAtCalls;
2881 }
2882 
2883 //------------------------------generate_deopt_blob----------------------------
2884 void SharedRuntime::generate_deopt_blob() {
2885   // Allocate space for the code
2886   ResourceMark rm;
2887   // Setup code generation tools
2888   int pad = 0;
2889   if (UseAVX > 2) {
2890     pad += 1024;
2891   }
2892   if (UseAPX) {
2893     pad += 1024;
2894   }
2895 #if INCLUDE_JVMCI
2896   if (EnableJVMCI) {
2897     pad += 512; // Increase the buffer size when compiling for JVMCI
2898   }
2899 #endif
2900   const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id);
2901   CodeBuffer buffer(name, 2560+pad, 1024);
2902   MacroAssembler* masm = new MacroAssembler(&buffer);
2903   int frame_size_in_words;
2904   OopMap* map = nullptr;
2905   OopMapSet *oop_maps = new OopMapSet();
2906 
2907   // -------------
2908   // This code enters when returning to a de-optimized nmethod.  A return
2909   // address has been pushed on the stack, and return values are in
2910   // registers.
  // If we are doing a normal deopt then we were called from the patched
  // nmethod at the point we returned to in that nmethod, so the return
  // address on the stack is wrong by NativeCall::instruction_size.
  // We will adjust the value so it looks like we have the original return
  // address on the stack (like when we eagerly deoptimized).
2916   // In the case of an exception pending when deoptimizing, we enter
2917   // with a return address on the stack that points after the call we patched
2918   // into the exception handler. We have the following register state from,
2919   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2920   //    rax: exception oop
2921   //    rbx: exception handler
2922   //    rdx: throwing pc
2923   // So in this case we simply jam rdx into the useless return address and
2924   // the stack looks just like we want.
2925   //
2926   // At this point we need to de-opt.  We save the argument return
2927   // registers.  We call the first C routine, fetch_unroll_info().  This
2928   // routine captures the return values and returns a structure which
2929   // describes the current frame size and the sizes of all replacement frames.
2930   // The current frame is compiled code and may contain many inlined
2931   // functions, each with their own JVM state.  We pop the current frame, then
2932   // push all the new frames.  Then we call the C routine unpack_frames() to
2933   // populate these frames.  Finally unpack_frames() returns us the new target
2934   // address.  Notice that callee-save registers are BLOWN here; they have
2935   // already been captured in the vframeArray at the time the return PC was
2936   // patched.
2937   address start = __ pc();
2938   Label cont;
2939 
  // Prolog for the non-exception case
2941 
2942   // Save everything in sight.
2943   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2944 
2945   // Normal deoptimization.  Save exec mode for unpack_frames.
2946   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2947   __ jmp(cont);
2948 
2949   int reexecute_offset = __ pc() - start;
2950 #if INCLUDE_JVMCI && !defined(COMPILER1)
2951   if (UseJVMCICompiler) {
2952     // JVMCI does not use this kind of deoptimization
2953     __ should_not_reach_here();
2954   }
2955 #endif
2956 
  // Reexecute case: the return address is the pc that describes which bci to
  // re-execute at.
2959 
2960   // No need to update map as each call to save_live_registers will produce identical oopmap
2961   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2962 
2963   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2964   __ jmp(cont);
2965 
2966 #if INCLUDE_JVMCI
2967   Label after_fetch_unroll_info_call;
2968   int implicit_exception_uncommon_trap_offset = 0;
2969   int uncommon_trap_offset = 0;
2970 
2971   if (EnableJVMCI) {
2972     implicit_exception_uncommon_trap_offset = __ pc() - start;
2973 
2974     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2975     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2976 
2977     uncommon_trap_offset = __ pc() - start;
2978 
2979     // Save everything in sight.
2980     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2981     // fetch_unroll_info needs to call last_java_frame()
2982     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2983 
2984     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2985     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2986 
2987     __ movl(r14, Deoptimization::Unpack_reexecute);
2988     __ mov(c_rarg0, r15_thread);
2989     __ movl(c_rarg2, r14); // exec mode
2990     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2991     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2992 
2993     __ reset_last_Java_frame(false);
2994 
2995     __ jmp(after_fetch_unroll_info_call);
2996   } // EnableJVMCI
2997 #endif // INCLUDE_JVMCI
2998 
2999   int exception_offset = __ pc() - start;
3000 
3001   // Prolog for exception case
3002 
  // All registers are dead at this entry point, except for rax and rdx,
  // which contain the exception oop and exception pc respectively.
  // Set them in TLS and fall through to the
  // unpack_with_exception_in_tls entry point.
3007 
3008   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3009   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3010 
3011   int exception_in_tls_offset = __ pc() - start;
3012 
3013   // new implementation because exception oop is now passed in JavaThread
3014 
3015   // Prolog for exception case
3016   // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
3018   // tos: stack at point of call to method that threw the exception (i.e. only
3019   // args are on the stack, no return address)
3020 
3021   // make room on stack for the return address
3022   // It will be patched later with the throwing pc. The correct value is not
3023   // available now because loading it from memory would destroy registers.
3024   __ push(0);
3025 
3026   // Save everything in sight.
3027   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
3028 
3029   // Now it is safe to overwrite any register
3030 
3031   // Deopt during an exception.  Save exec mode for unpack_frames.
3032   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
3033 
3034   // load throwing pc from JavaThread and patch it as the return address
3035   // of the current frame. Then clear the field in JavaThread
3036 
3037   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3038   __ movptr(Address(rbp, wordSize), rdx);
3039   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3040 
3041 #ifdef ASSERT
3042   // verify that there is really an exception oop in JavaThread
3043   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3044   __ verify_oop(rax);
3045 
3046   // verify that there is no pending exception
3047   Label no_pending_exception;
3048   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3049   __ testptr(rax, rax);
3050   __ jcc(Assembler::zero, no_pending_exception);
3051   __ stop("must not have pending exception here");
3052   __ bind(no_pending_exception);
3053 #endif
3054 
3055   __ bind(cont);
3056 
3057   // Call C code.  Need thread and this frame, but NOT official VM entry
3058   // crud.  We cannot block on this call, no GC can happen.
3059   //
3060   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
3061 
3062   // fetch_unroll_info needs to call last_java_frame().
3063 
3064   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3065 #ifdef ASSERT
3066   { Label L;
3067     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3068     __ jcc(Assembler::equal, L);
3069     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
3070     __ bind(L);
3071   }
3072 #endif // ASSERT
3073   __ mov(c_rarg0, r15_thread);
3074   __ movl(c_rarg1, r14); // exec_mode
3075   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
3076 
3077   // Need to have an oopmap that tells fetch_unroll_info where to
3078   // find any register it might need.
3079   oop_maps->add_gc_map(__ pc() - start, map);
3080 
3081   __ reset_last_Java_frame(false);
3082 
3083 #if INCLUDE_JVMCI
3084   if (EnableJVMCI) {
3085     __ bind(after_fetch_unroll_info_call);
3086   }
3087 #endif
3088 
3089   // Load UnrollBlock* into rdi
3090   __ mov(rdi, rax);
3091 
3092   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
  Label noException;
3094   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
3095   __ jcc(Assembler::notEqual, noException);
3096   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3097   // QQQ this is useless it was null above
3098   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3099   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3100   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3101 
3102   __ verify_oop(rax);
3103 
3104   // Overwrite the result registers with the exception results.
3105   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3106   // I think this is useless
3107   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
3108 
3109   __ bind(noException);
3110 
3111   // Only register save data is on the stack.
3112   // Now restore the result registers.  Everything else is either dead
3113   // or captured in the vframeArray.
3114   RegisterSaver::restore_result_registers(masm);
3115 
  // All of the register save area has been popped off the stack. Only the
  // return address remains.
3118 
3119   // Pop all the frames we must move/replace.
3120   //
3121   // Frame picture (youngest to oldest)
3122   // 1: self-frame (no frame link)
3123   // 2: deopting frame  (no frame link)
3124   // 3: caller of deopting frame (could be compiled/interpreted).
3125   //
  // Note: by leaving the return address of the self-frame on the stack
  // and using the size of frame 2 to adjust the stack,
  // the return address to frame 3 will still be on the stack when we are done.
3129 
3130   // Pop deoptimized frame
3131   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
3132   __ addptr(rsp, rcx);
3133 
3134   // rsp should be pointing at the return address to the caller (3)
3135 
3136   // Pick up the initial fp we should save
3137   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3138   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3139 
3140 #ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need, so this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
3144   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3145   __ bang_stack_size(rbx, rcx);
3146 #endif
3147 
3148   // Load address of array of frame pcs into rcx
3149   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3150 
3151   // Trash the old pc
3152   __ addptr(rsp, wordSize);
3153 
3154   // Load address of array of frame sizes into rsi
3155   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3156 
3157   // Load counter into rdx
3158   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
3159 
  // Now adjust the caller's stack to make up for the extra locals, but
  // record the original sp so that we can save it in the skeletal
  // interpreter frame; the stack walking of interpreter_sender will then
  // get the unextended sp value and not the "real" sp value.
3164 
3165   const Register sender_sp = r8;
3166 
3167   __ mov(sender_sp, rsp);
3168   __ movl(rbx, Address(rdi,
3169                        Deoptimization::UnrollBlock::
3170                        caller_adjustment_offset()));
3171   __ subptr(rsp, rbx);
3172 
3173   // Push interpreter frames in a loop
3174   Label loop;
3175   __ bind(loop);
3176   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3177   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3178   __ pushptr(Address(rcx, 0));          // Save return address
3179   __ enter();                           // Save old & set new ebp
3180   __ subptr(rsp, rbx);                  // Prolog
3181   // This value is corrected by layout_activation_impl
3182   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3183   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3184   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3185   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3186   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3187   __ decrementl(rdx);                   // Decrement counter
3188   __ jcc(Assembler::notZero, loop);
3189   __ pushptr(Address(rcx, 0));          // Save final return address
3190 
3191   // Re-push self-frame
3192   __ enter();                           // Save old & set new ebp
3193 
3194   // Allocate a full sized register save area.
3195   // Return address and rbp are in place, so we allocate two less words.
3196   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3197 
3198   // Restore frame locals after moving the frame
3199   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3200   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3201 
3202   // Call C code.  Need thread but NOT official VM entry
3203   // crud.  We cannot block on this call, no GC can happen.  Call should
3204   // restore return values to their stack-slots with the new SP.
3205   //
3206   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3207 
3208   // Use rbp because the frames look interpreted now
3209   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3210   // Don't need the precise return PC here, just precise enough to point into this code blob.
3211   address the_pc = __ pc();
3212   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3213 
3214   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3215   __ mov(c_rarg0, r15_thread);
3216   __ movl(c_rarg1, r14); // second arg: exec_mode
3217   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3218   // Revert SP alignment after call since we're going to do some SP relative addressing below
3219   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3220 
3221   // Set an oopmap for the call site
3222   // Use the same PC we used for the last java frame
3223   oop_maps->add_gc_map(the_pc - start,
3224                        new OopMap( frame_size_in_words, 0 ));
3225 
3226   // Clear fp AND pc
3227   __ reset_last_Java_frame(true);
3228 
3229   // Collect return values
3230   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3231   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3232   // I think this is useless (throwing pc?)
3233   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3234 
3235   // Pop self-frame.
3236   __ leave();                           // Epilog
3237 
3238   // Jump to interpreter
3239   __ ret(0);
3240 
3241   // Make sure all code is generated
3242   masm->flush();
3243 
3244   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3245   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3246 #if INCLUDE_JVMCI
3247   if (EnableJVMCI) {
3248     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3249     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3250   }
3251 #endif
3252 }
3253 
3254 //------------------------------generate_handler_blob------
3255 //
// Generate a special Compile2Runtime blob that saves all registers
// and sets up an oopmap.
3258 //
3259 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) {
3260   assert(StubRoutines::forward_exception_entry() != nullptr,
3261          "must be generated before");
3262   assert(is_polling_page_id(id), "expected a polling page stub id");
3263 
3264   ResourceMark rm;
3265   OopMapSet *oop_maps = new OopMapSet();
3266   OopMap* map;
3267 
3268   // Allocate space for the code.  Setup code generation tools.
3269   const char* name = SharedRuntime::stub_name(id);
3270   CodeBuffer buffer(name, 2348, 1024);
3271   MacroAssembler* masm = new MacroAssembler(&buffer);
3272 
3273   address start   = __ pc();
3274   address call_pc = nullptr;
3275   int frame_size_in_words;
3276   bool cause_return = (id == SharedStubId::polling_page_return_handler_id);
3277   bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id);
3278 
3279   // Make room for return address (or push it again)
3280   if (!cause_return) {
3281     __ push(rbx);
3282   }
3283 
3284   // Save registers, fpu state, and flags
3285   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3286 
3287   // The following is basically a call_VM.  However, we need the precise
3288   // address of the call in order to generate an oopmap. Hence, we do all the
3289   // work ourselves.
3290 
3291   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3292 
  // The return address must always be correct so that the frame constructor
  // never sees an invalid pc.
3295 
3296   if (!cause_return) {
3297     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3298     // Additionally, rbx is a callee saved register and we can look at it later to determine
3299     // if someone changed the return address for us!
3300     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3301     __ movptr(Address(rbp, wordSize), rbx);
3302   }
3303 
3304   // Do the call
3305   __ mov(c_rarg0, r15_thread);
3306   __ call(RuntimeAddress(call_ptr));
3307 
3308   // Set an oopmap for the call site.  This oopmap will map all
3309   // oop-registers and debug-info registers as callee-saved.  This
3310   // will allow deoptimization at this safepoint to find all possible
3311   // debug-info recordings, as well as let GC find all oops.
3312 
3313   oop_maps->add_gc_map( __ pc() - start, map);
3314 
3315   Label noException;
3316 
3317   __ reset_last_Java_frame(false);
3318 
3319   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3320   __ jcc(Assembler::equal, noException);
3321 
3322   // Exception pending
3323 
3324   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3325 
3326   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3327 
3328   // No exception case
3329   __ bind(noException);
3330 
3331   Label no_adjust;
3332 #ifdef ASSERT
3333   Label bail;
3334 #endif
3335   if (!cause_return) {
3336     Label no_prefix, not_special;
3337 
3338     // If our stashed return pc was modified by the runtime we avoid touching it
3339     __ cmpptr(rbx, Address(rbp, wordSize));
3340     __ jccb(Assembler::notEqual, no_adjust);
3341 
3342     // Skip over the poll instruction.
3343     // See NativeInstruction::is_safepoint_poll()
3344     // Possible encodings:
3345     //      85 00       test   %eax,(%rax)
3346     //      85 01       test   %eax,(%rcx)
3347     //      85 02       test   %eax,(%rdx)
3348     //      85 03       test   %eax,(%rbx)
3349     //      85 06       test   %eax,(%rsi)
3350     //      85 07       test   %eax,(%rdi)
3351     //
3352     //   41 85 00       test   %eax,(%r8)
3353     //   41 85 01       test   %eax,(%r9)
3354     //   41 85 02       test   %eax,(%r10)
3355     //   41 85 03       test   %eax,(%r11)
3356     //   41 85 06       test   %eax,(%r14)
3357     //   41 85 07       test   %eax,(%r15)
3358     //
3359     //      85 04 24    test   %eax,(%rsp)
3360     //   41 85 04 24    test   %eax,(%r12)
3361     //      85 45 00    test   %eax,0x0(%rbp)
3362     //   41 85 45 00    test   %eax,0x0(%r13)
3363 
3364     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3365     __ jcc(Assembler::notEqual, no_prefix);
3366     __ addptr(rbx, 1);
3367     __ bind(no_prefix);
3368 #ifdef ASSERT
3369     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3370 #endif
3371     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3372     // r12/rsp 0x04
3373     // r13/rbp 0x05
3374     __ movzbq(rcx, Address(rbx, 1));
3375     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3376     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3377     __ cmpptr(rcx, 1);
3378     __ jcc(Assembler::above, not_special);
3379     __ addptr(rbx, 1);
3380     __ bind(not_special);
3381 #ifdef ASSERT
3382     // Verify the correct encoding of the poll we're about to skip.
3383     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3384     __ jcc(Assembler::notEqual, bail);
3385     // Mask out the modrm bits
3386     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3387     // rax encodes to 0, so if the bits are nonzero it's incorrect
3388     __ jcc(Assembler::notZero, bail);
3389 #endif
3390     // Adjust return pc forward to step over the safepoint poll instruction
3391     __ addptr(rbx, 2);
3392     __ movptr(Address(rbp, wordSize), rbx);
3393   }
3394 
3395   __ bind(no_adjust);
3396   // Normal exit, restore registers and exit.
3397   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3398   __ ret(0);
3399 
3400 #ifdef ASSERT
3401   __ bind(bail);
3402   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3403 #endif
3404 
3405   // Make sure all code is generated
3406   masm->flush();
3407 
3408   // Fill-out other meta info
3409   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3410 }
3411 
3412 //
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
//
// Generate a stub that calls into the VM to find out the proper destination
// of a Java call. All the argument registers are live at this point,
// but since this is generic code we don't know what they are and the caller
// must do any GC of the args.
3419 //
3420 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3421   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3422   assert(is_resolve_id(id), "expected a resolve stub id");
3423 
3424   // allocate space for the code
3425   ResourceMark rm;
3426 
3427   const char* name = SharedRuntime::stub_name(id);
3428   CodeBuffer buffer(name, 1552, 512);
3429   MacroAssembler* masm = new MacroAssembler(&buffer);
3430 
3431   int frame_size_in_words;
3432 
3433   OopMapSet *oop_maps = new OopMapSet();
3434   OopMap* map = nullptr;
3435 
3436   int start = __ offset();
3437 
3438   // No need to save vector registers since they are caller-saved anyway.
3439   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3440 
3441   int frame_complete = __ offset();
3442 
3443   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3444 
3445   __ mov(c_rarg0, r15_thread);
3446 
3447   __ call(RuntimeAddress(destination));
3448 
3449 
3450   // Set an oopmap for the call site.
3451   // We need this not only for callee-saved registers, but also for volatile
3452   // registers that the compiler might be keeping live across a safepoint.
3453 
3454   oop_maps->add_gc_map( __ offset() - start, map);
3455 
3456   // rax contains the address we are going to jump to assuming no exception got installed
3457 
3458   // clear last_Java_sp
3459   __ reset_last_Java_frame(false);
3460   // check for pending exceptions
3461   Label pending;
3462   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3463   __ jcc(Assembler::notEqual, pending);
3464 
3465   // get the returned Method*
3466   __ get_vm_result_2(rbx, r15_thread);
3467   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3468 
3469   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3470 
3471   RegisterSaver::restore_live_registers(masm);
3472 
3473   // We are back to the original state on entry and ready to go.
3474 
3475   __ jmp(rax);
3476 
3477   // Pending exception after the safepoint
3478 
3479   __ bind(pending);
3480 
3481   RegisterSaver::restore_live_registers(masm);
3482 
3483   // exception pending => remove activation and forward to exception handler
3484 
3485   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3486 
3487   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3488   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3489 
3490   // -------------
3491   // make sure all code is generated
3492   masm->flush();
3493 
3494   // return the  blob
3495   // frame_size_words or bytes??
3496   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3497 }
3498 
3499 // Continuation point for throwing of implicit exceptions that are
3500 // not handled in the current activation. Fabricates an exception
3501 // oop and initiates normal exception dispatching in this
3502 // frame. Since we need to preserve callee-saved values (currently
3503 // only for C2, but done for C1 as well) we need a callee-saved oop
3504 // map and therefore have to make these stubs into RuntimeStubs
3505 // rather than BufferBlobs.  If the compiler needs all registers to
3506 // be preserved between the fault point and the exception handler
3507 // then it must assume responsibility for that in
3508 // AbstractCompiler::continuation_for_implicit_null_exception or
3509 // continuation_for_implicit_division_by_zero_exception. All other
3510 // implicit exceptions (e.g., NullPointerException or
3511 // AbstractMethodError on entry) are either at call sites or
3512 // otherwise assume that stack unwinding will be initiated, so
3513 // caller saved registers were assumed volatile in the compiler.
3514 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
3515   assert(is_throw_id(id), "expected a throw stub id");
3516 
3517   const char* name = SharedRuntime::stub_name(id);
3518 
3519   // Information about frame layout at time of blocking runtime call.
3520   // Note that we only have to preserve callee-saved registers since
3521   // the compilers are responsible for supplying a continuation point
3522   // if they expect all registers to be preserved.
3523   enum layout {
3524     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3525     rbp_off2,
3526     return_off,
3527     return_off2,
3528     framesize // inclusive of return address
3529   };
3530 
3531   int insts_size = 512;
3532   int locs_size  = 64;
3533 
3534   ResourceMark rm;
3535   const char* timer_msg = "SharedRuntime generate_throw_exception";
3536   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3537 
3538   CodeBuffer code(name, insts_size, locs_size);
3539   OopMapSet* oop_maps  = new OopMapSet();
3540   MacroAssembler* masm = new MacroAssembler(&code);
3541 
3542   address start = __ pc();
3543 
3544   // This is an inlined and slightly modified version of call_VM
3545   // which has the ability to fetch the return PC out of
3546   // thread-local storage and also sets up last_Java_sp slightly
3547   // differently than the real call_VM
3548 
3549   __ enter(); // required for proper stackwalking of RuntimeStub frame
3550 
3551   assert(is_even(framesize/2), "sp not 16-byte aligned");
3552 
3553   // return address and rbp are already in place
3554   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3555 
3556   int frame_complete = __ pc() - start;
3557 
3558   // Set up last_Java_sp and last_Java_fp
3559   address the_pc = __ pc();
3560   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3561   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3562 
3563   // Call runtime
3564   __ movptr(c_rarg0, r15_thread);
3565   BLOCK_COMMENT("call runtime_entry");
3566   __ call(RuntimeAddress(runtime_entry));
3567 
3568   // Generate oop map
3569   OopMap* map = new OopMap(framesize, 0);
3570 
3571   oop_maps->add_gc_map(the_pc - start, map);
3572 
3573   __ reset_last_Java_frame(true);
3574 
3575   __ leave(); // required for proper stackwalking of RuntimeStub frame
3576 
3577   // check for pending exceptions
3578 #ifdef ASSERT
3579   Label L;
3580   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3581   __ jcc(Assembler::notEqual, L);
3582   __ should_not_reach_here();
3583   __ bind(L);
3584 #endif // ASSERT
3585   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3586 
3587 
3588   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3589   RuntimeStub* stub =
3590     RuntimeStub::new_runtime_stub(name,
3591                                   &code,
3592                                   frame_complete,
3593                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3594                                   oop_maps, false);
3595   return stub;
3596 }
3597 
3598 //------------------------------Montgomery multiplication------------------------
3599 //
3600 
3601 #ifndef _WINDOWS
3602 
3603 // Subtract 0:b from carry:a.  Return carry.
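// Roughly equivalent C, as a sketch of the semantics only (the inline asm
// below is what actually runs):
//   julong borrow = 0;
//   for (long i = 0; i < len; i++) {
//     julong ai = a[i];
//     a[i] = ai - b[i] - borrow;
//     borrow = (ai < b[i]) || (borrow && ai == b[i]);
//   }
//   return carry - borrow;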
3604 static julong
3605 sub(julong a[], julong b[], julong carry, long len) {
3606   long long i = 0, cnt = len;
3607   julong tmp;
3608   asm volatile("clc; "
3609                "0: ; "
3610                "mov (%[b], %[i], 8), %[tmp]; "
3611                "sbb %[tmp], (%[a], %[i], 8); "
3612                "inc %[i]; dec %[cnt]; "
3613                "jne 0b; "
3614                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3615                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3616                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3617                : "memory");
3618   return tmp;
3619 }
3620 
3621 // Multiply (unsigned) Long A by Long B, accumulating the double-
3622 // length result into the accumulator formed of T0, T1, and T2.
3623 #define MACC(A, B, T0, T1, T2)                                  \
3624 do {                                                            \
3625   unsigned long hi, lo;                                         \
3626   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3627            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3628            : "r"(A), "a"(B) : "cc");                            \
3629  } while(0)
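
// For reference, MACC is semantically equivalent to the following sketch
// (assuming an unsigned __int128 type; the asm above is what is used):
//   unsigned __int128 p   = (unsigned __int128)A * B;
//   unsigned __int128 acc = ((unsigned __int128)T1 << 64) | T0;
//   acc += p;                  // low 128 bits of the T2:T1:T0 accumulator
//   T0  = (julong)acc;
//   T1  = (julong)(acc >> 64);
//   T2 += (acc < p);           // carry out of the low 128 bits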
3630 
3631 // As above, but add twice the double-length result into the
3632 // accumulator.
3633 #define MACC2(A, B, T0, T1, T2)                                 \
3634 do {                                                            \
3635   unsigned long hi, lo;                                         \
3636   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3637            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3638            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3639            : "r"(A), "a"(B) : "cc");                            \
3640  } while(0)
3641 
3642 #else //_WINDOWS
3643 
3644 static julong
3645 sub(julong a[], julong b[], julong carry, long len) {
3646   long i;
3647   julong tmp;
3648   unsigned char c = 1;
3649   for (i = 0; i < len; i++) {
3650     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3651     a[i] = tmp;
3652   }
3653   c = _addcarry_u64(c, carry, ~0, &tmp);
3654   return tmp;
3655 }
3656 
3657 // Multiply (unsigned) Long A by Long B, accumulating the double-
3658 // length result into the accumulator formed of T0, T1, and T2.
3659 #define MACC(A, B, T0, T1, T2)                          \
3660 do {                                                    \
3661   julong hi, lo;                            \
3662   lo = _umul128(A, B, &hi);                             \
3663   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3664   c = _addcarry_u64(c, hi, T1, &T1);                    \
3665   _addcarry_u64(c, T2, 0, &T2);                         \
3666  } while(0)
3667 
3668 // As above, but add twice the double-length result into the
3669 // accumulator.
3670 #define MACC2(A, B, T0, T1, T2)                         \
3671 do {                                                    \
3672   julong hi, lo;                            \
3673   lo = _umul128(A, B, &hi);                             \
3674   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3675   c = _addcarry_u64(c, hi, T1, &T1);                    \
3676   _addcarry_u64(c, T2, 0, &T2);                         \
3677   c = _addcarry_u64(0, lo, T0, &T0);                    \
3678   c = _addcarry_u64(c, hi, T1, &T1);                    \
3679   _addcarry_u64(c, T2, 0, &T2);                         \
3680  } while(0)
3681 
3682 #endif //_WINDOWS
3683 
3684 // Fast Montgomery multiplication.  The derivation of the algorithm is
3685 // in  A Cryptographic Library for the Motorola DSP56000,
3686 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
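//
// Sketch of the word-level idea (for orientation only): n is odd and
// inv == -n^-1 mod 2^64, so inv * n[0] == ULLONG_MAX (i.e. -1 mod 2^64),
// which the code asserts. On each step the low accumulator word t0 is
// cancelled by adding (t0 * inv mod 2^64) * n, hence the assert that t0 is
// zero afterwards. The result m is congruent to a * b * 2^(-64*len) mod n;
// the trailing while-loop subtracts n to absorb any carry beyond len words.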
3687 
3688 static void NOINLINE
3689 montgomery_multiply(julong a[], julong b[], julong n[],
3690                     julong m[], julong inv, int len) {
3691   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3692   int i;
3693 
3694   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3695 
3696   for (i = 0; i < len; i++) {
3697     int j;
3698     for (j = 0; j < i; j++) {
3699       MACC(a[j], b[i-j], t0, t1, t2);
3700       MACC(m[j], n[i-j], t0, t1, t2);
3701     }
3702     MACC(a[i], b[0], t0, t1, t2);
3703     m[i] = t0 * inv;
3704     MACC(m[i], n[0], t0, t1, t2);
3705 
3706     assert(t0 == 0, "broken Montgomery multiply");
3707 
3708     t0 = t1; t1 = t2; t2 = 0;
3709   }
3710 
3711   for (i = len; i < 2*len; i++) {
3712     int j;
3713     for (j = i-len+1; j < len; j++) {
3714       MACC(a[j], b[i-j], t0, t1, t2);
3715       MACC(m[j], n[i-j], t0, t1, t2);
3716     }
3717     m[i-len] = t0;
3718     t0 = t1; t1 = t2; t2 = 0;
3719   }
3720 
3721   while (t0)
3722     t0 = sub(m, n, t0, len);
3723 }
3724 
3725 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3726 // multiplies so it should be up to 25% faster than Montgomery
3727 // multiplication.  However, its loop control is more complex and it
3728 // may actually run slower on some machines.
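//
// The saving comes from symmetry: for j != i-j the products a[j]*a[i-j] and
// a[i-j]*a[j] are equal, so each pair is computed once and added twice via
// MACC2, while only the square terms a[j]*a[j] are computed singly. Roughly
// half of the a*a multiplies disappear, but the m*n multiplies remain, which
// is where the ~25% overall saving comes from.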
3729 
3730 static void NOINLINE
3731 montgomery_square(julong a[], julong n[],
3732                   julong m[], julong inv, int len) {
3733   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3734   int i;
3735 
3736   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3737 
3738   for (i = 0; i < len; i++) {
3739     int j;
3740     int end = (i+1)/2;
3741     for (j = 0; j < end; j++) {
3742       MACC2(a[j], a[i-j], t0, t1, t2);
3743       MACC(m[j], n[i-j], t0, t1, t2);
3744     }
3745     if ((i & 1) == 0) {
3746       MACC(a[j], a[j], t0, t1, t2);
3747     }
3748     for (; j < i; j++) {
3749       MACC(m[j], n[i-j], t0, t1, t2);
3750     }
3751     m[i] = t0 * inv;
3752     MACC(m[i], n[0], t0, t1, t2);
3753 
3754     assert(t0 == 0, "broken Montgomery square");
3755 
3756     t0 = t1; t1 = t2; t2 = 0;
3757   }
3758 
3759   for (i = len; i < 2*len; i++) {
3760     int start = i-len+1;
3761     int end = start + (len - start)/2;
3762     int j;
3763     for (j = start; j < end; j++) {
3764       MACC2(a[j], a[i-j], t0, t1, t2);
3765       MACC(m[j], n[i-j], t0, t1, t2);
3766     }
3767     if ((i & 1) == 0) {
3768       MACC(a[j], a[j], t0, t1, t2);
3769     }
3770     for (; j < len; j++) {
3771       MACC(m[j], n[i-j], t0, t1, t2);
3772     }
3773     m[i-len] = t0;
3774     t0 = t1; t1 = t2; t2 = 0;
3775   }
3776 
3777   while (t0)
3778     t0 = sub(m, n, t0, len);
3779 }
3780 
3781 // Swap words in a longword.
3782 static julong swap(julong x) {
3783   return (x << 32) | (x >> 32);
3784 }
3785 
3786 // Copy len longwords from s to d, word-swapping as we go.  The
3787 // destination array is reversed.
3788 static void reverse_words(julong *s, julong *d, int len) {
3789   d += len;
3790   while(len-- > 0) {
3791     d--;
3792     *d = swap(*s);
3793     s++;
3794   }
3795 }
3796 
3797 // The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3799 #define MONTGOMERY_SQUARING_THRESHOLD 64
3800 
3801 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3802                                         jint len, jlong inv,
3803                                         jint *m_ints) {
3804   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3805   int longwords = len/2;
3806 
  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints correspond to a 16384-bit integer and
  // will use here a total of 8k bytes of stack space.
3810   int divisor = sizeof(julong) * 4;
3811   guarantee(longwords <= 8192 / divisor, "must be");
3812   int total_allocation = longwords * sizeof (julong) * 4;
3813   julong *scratch = (julong *)alloca(total_allocation);
3814 
3815   // Local scratch arrays
3816   julong
3817     *a = scratch + 0 * longwords,
3818     *b = scratch + 1 * longwords,
3819     *n = scratch + 2 * longwords,
3820     *m = scratch + 3 * longwords;
3821 
3822   reverse_words((julong *)a_ints, a, longwords);
3823   reverse_words((julong *)b_ints, b, longwords);
3824   reverse_words((julong *)n_ints, n, longwords);
3825 
3826   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3827 
3828   reverse_words(m, (julong *)m_ints, longwords);
3829 }
3830 
3831 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3832                                       jint len, jlong inv,
3833                                       jint *m_ints) {
3834   assert(len % 2 == 0, "array length in montgomery_square must be even");
3835   int longwords = len/2;
3836 
  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints correspond to a 16384-bit integer and
  // will use here a total of 6k bytes of stack space.
3840   int divisor = sizeof(julong) * 3;
3841   guarantee(longwords <= (8192 / divisor), "must be");
3842   int total_allocation = longwords * sizeof (julong) * 3;
3843   julong *scratch = (julong *)alloca(total_allocation);
3844 
3845   // Local scratch arrays
3846   julong
3847     *a = scratch + 0 * longwords,
3848     *n = scratch + 1 * longwords,
3849     *m = scratch + 2 * longwords;
3850 
3851   reverse_words((julong *)a_ints, a, longwords);
3852   reverse_words((julong *)n_ints, n, longwords);
3853 
3854   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3855     ::montgomery_square(a, n, m, (julong)inv, longwords);
3856   } else {
3857     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3858   }
3859 
3860   reverse_words(m, (julong *)m_ints, longwords);
3861 }
3862 
3863 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
3864   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
3865   CodeBuffer buffer(buf);
3866   short buffer_locs[20];
3867   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
3868                                          sizeof(buffer_locs)/sizeof(relocInfo));
3869 
3870   MacroAssembler* masm = new MacroAssembler(&buffer);
3871 
3872   const Array<SigEntry>* sig_vk = vk->extended_sig();
3873   const Array<VMRegPair>* regs = vk->return_regs();
3874 
3875   int pack_fields_jobject_off = __ offset();
3876   // Resolve pre-allocated buffer from JNI handle.
3877   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
3878   __ movptr(rax, Address(r13, 0));
3879   __ resolve_jobject(rax /* value */,
3880                      r15_thread /* thread */,
3881                      r12 /* tmp */);
3882   __ movptr(Address(r13, 0), rax);
3883 
3884   int pack_fields_off = __ offset();
3885 
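  // Walk the extended signature and store each field from its return register
  // into the buffer at its field offset. T_METADATA entries carry no value and
  // are skipped; a T_VOID entry is the placeholder for the upper half of a
  // preceding T_LONG/T_DOUBLE and only advances the register index j.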
3886   int j = 1;
3887   for (int i = 0; i < sig_vk->length(); i++) {
3888     BasicType bt = sig_vk->at(i)._bt;
3889     if (bt == T_METADATA) {
3890       continue;
3891     }
3892     if (bt == T_VOID) {
3893       if (sig_vk->at(i-1)._bt == T_LONG ||
3894           sig_vk->at(i-1)._bt == T_DOUBLE) {
3895         j++;
3896       }
3897       continue;
3898     }
3899     int off = sig_vk->at(i)._offset;
3900     assert(off > 0, "offset in object should be positive");
3901     VMRegPair pair = regs->at(j);
3902     VMReg r_1 = pair.first();
3903     VMReg r_2 = pair.second();
3904     Address to(rax, off);
3905     if (bt == T_FLOAT) {
3906       __ movflt(to, r_1->as_XMMRegister());
3907     } else if (bt == T_DOUBLE) {
3908       __ movdbl(to, r_1->as_XMMRegister());
3909     } else {
3910       Register val = r_1->as_Register();
3911       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
3912       if (is_reference_type(bt)) {
3913         __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
3914       } else {
3915         __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
3916       }
3917     }
3918     j++;
3919   }
3920   assert(j == regs->length(), "missed a field?");
3921 
3922   __ ret(0);
3923 
3924   int unpack_fields_off = __ offset();
3925 
3926   Label skip;
3927   __ testptr(rax, rax);
3928   __ jcc(Assembler::zero, skip);
3929 
3930   j = 1;
3931   for (int i = 0; i < sig_vk->length(); i++) {
3932     BasicType bt = sig_vk->at(i)._bt;
3933     if (bt == T_METADATA) {
3934       continue;
3935     }
3936     if (bt == T_VOID) {
3937       if (sig_vk->at(i-1)._bt == T_LONG ||
3938           sig_vk->at(i-1)._bt == T_DOUBLE) {
3939         j++;
3940       }
3941       continue;
3942     }
3943     int off = sig_vk->at(i)._offset;
3944     assert(off > 0, "offset in object should be positive");
3945     VMRegPair pair = regs->at(j);
3946     VMReg r_1 = pair.first();
3947     VMReg r_2 = pair.second();
3948     Address from(rax, off);
3949     if (bt == T_FLOAT) {
3950       __ movflt(r_1->as_XMMRegister(), from);
3951     } else if (bt == T_DOUBLE) {
3952       __ movdbl(r_1->as_XMMRegister(), from);
3953     } else if (bt == T_OBJECT || bt == T_ARRAY) {
3954       assert_different_registers(rax, r_1->as_Register());
3955       __ load_heap_oop(r_1->as_Register(), from);
3956     } else {
3957       assert(is_java_primitive(bt), "unexpected basic type");
3958       assert_different_registers(rax, r_1->as_Register());
3959       size_t size_in_bytes = type2aelembytes(bt);
3960       __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
3961     }
3962     j++;
3963   }
3964   assert(j == regs->length(), "missed a field?");
3965 
3966   __ bind(skip);
3967   __ ret(0);
3968 
3969   __ flush();
3970 
3971   return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
3972 }
3973 
3974 #if INCLUDE_JFR
3975 
3976 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3977 // It returns a jobject handle to the event writer.
3978 // The handle is dereferenced and the return value is the event writer oop.
3979 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3980   enum layout {
3981     rbp_off,
3982     rbpH_off,
3983     return_off,
3984     return_off2,
3985     framesize // inclusive of return address
3986   };
3987 
3988   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id);
3989   CodeBuffer code(name, 1024, 64);
3990   MacroAssembler* masm = new MacroAssembler(&code);
3991   address start = __ pc();
3992 
3993   __ enter();
3994   address the_pc = __ pc();
3995 
3996   int frame_complete = the_pc - start;
3997 
3998   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3999   __ movptr(c_rarg0, r15_thread);
4000   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
4001   __ reset_last_Java_frame(true);
4002 
4003   // rax is jobject handle result, unpack and process it through a barrier.
4004   __ resolve_global_jobject(rax, r15_thread, c_rarg0);
4005 
4006   __ leave();
4007   __ ret(0);
4008 
4009   OopMapSet* oop_maps = new OopMapSet();
4010   OopMap* map = new OopMap(framesize, 1);
4011   oop_maps->add_gc_map(frame_complete, map);
4012 
4013   RuntimeStub* stub =
4014     RuntimeStub::new_runtime_stub(name,
4015                                   &code,
4016                                   frame_complete,
4017                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4018                                   oop_maps,
4019                                   false);
4020   return stub;
4021 }
4022 
4023 // For c2: call to return a leased buffer.
4024 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
4025   enum layout {
4026     rbp_off,
4027     rbpH_off,
4028     return_off,
4029     return_off2,
4030     framesize // inclusive of return address
4031   };
4032 
4033   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id);
4034   CodeBuffer code(name, 1024, 64);
4035   MacroAssembler* masm = new MacroAssembler(&code);
4036   address start = __ pc();
4037 
4038   __ enter();
4039   address the_pc = __ pc();
4040 
4041   int frame_complete = the_pc - start;
4042 
4043   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
4044   __ movptr(c_rarg0, r15_thread);
4045   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
4046   __ reset_last_Java_frame(true);
4047 
4048   __ leave();
4049   __ ret(0);
4050 
4051   OopMapSet* oop_maps = new OopMapSet();
4052   OopMap* map = new OopMap(framesize, 1);
4053   oop_maps->add_gc_map(frame_complete, map);
4054 
4055   RuntimeStub* stub =
4056     RuntimeStub::new_runtime_stub(name,
4057                                   &code,
4058                                   frame_complete,
4059                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4060                                   oop_maps,
4061                                   false);
4062   return stub;
4063 }
4064 
4065 #endif // INCLUDE_JFR