1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "classfile/symbolTable.hpp"
  32 #include "code/compiledIC.hpp"
  33 #include "code/debugInfoRec.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "oops/method.inline.hpp"
  47 #include "prims/methodHandles.hpp"
  48 #include "runtime/continuation.hpp"
  49 #include "runtime/continuationEntry.inline.hpp"
  50 #include "runtime/globals.hpp"
  51 #include "runtime/jniHandles.hpp"
  52 #include "runtime/safepointMechanism.hpp"
  53 #include "runtime/sharedRuntime.hpp"
  54 #include "runtime/signature.hpp"
  55 #include "runtime/stubRoutines.hpp"
  56 #include "runtime/timerTrace.hpp"
  57 #include "runtime/vframeArray.hpp"
  58 #include "runtime/vm_version.hpp"
  59 #include "utilities/align.hpp"
  60 #include "utilities/checkedCast.hpp"
  61 #include "utilities/formatBuffer.hpp"
  62 #include "vmreg_x86.inline.hpp"
  63 #ifdef COMPILER1
  64 #include "c1/c1_Runtime1.hpp"
  65 #endif
  66 #ifdef COMPILER2
  67 #include "opto/runtime.hpp"
  68 #endif
  69 #if INCLUDE_JVMCI
  70 #include "jvmci/jvmciJavaClasses.hpp"
  71 #endif
  72 
  73 #define __ masm->
  74 
  75 #ifdef PRODUCT
  76 #define BLOCK_COMMENT(str) /* nothing */
  77 #else
  78 #define BLOCK_COMMENT(str) __ block_comment(str)
  79 #endif // PRODUCT
  80 
  81 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  82 
  83 class RegisterSaver {
  84   // Capture info about frame layout.  Layout offsets are in jint
  85   // units because compiler frame slots are jints.
  86 #define XSAVE_AREA_BEGIN 160
  87 #define XSAVE_AREA_YMM_BEGIN 576
  88 #define XSAVE_AREA_EGPRS 960
  89 #define XSAVE_AREA_OPMASK_BEGIN 1088
  90 #define XSAVE_AREA_ZMM_BEGIN 1152
  91 #define XSAVE_AREA_UPPERBANK 1664
  92 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  93 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  94 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  95 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  96 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
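  // For example, DEF_XMM_OFFS(0) expands to the two enum entries
  //   xmm0_off = xmm_off + (0)*16/BytesPerInt, xmm0H_off
  // i.e. each register gets a <reg>_off slot plus an adjacent <reg>H_off slot
  // naming the upper half for oop map purposes.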
  97   enum layout {
  98     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  99     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
 100     DEF_XMM_OFFS(0),
 101     DEF_XMM_OFFS(1),
 102     // 2..15 are implied in range usage
 103     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 104     DEF_YMM_OFFS(0),
 105     DEF_YMM_OFFS(1),
 106     // 2..15 are implied in range usage
 107     r31_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 108     r31H_off,
 109     r30_off, r30H_off,
 110     r29_off, r29H_off,
 111     r28_off, r28H_off,
 112     r27_off, r27H_off,
 113     r26_off, r26H_off,
 114     r25_off, r25H_off,
 115     r24_off, r24H_off,
 116     r23_off, r23H_off,
 117     r22_off, r22H_off,
 118     r21_off, r21H_off,
 119     r20_off, r20H_off,
 120     r19_off, r19H_off,
 121     r18_off, r18H_off,
 122     r17_off, r17H_off,
 123     r16_off, r16H_off,
 124     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 125     DEF_OPMASK_OFFS(0),
 126     DEF_OPMASK_OFFS(1),
 127     // 2..7 are implied in range usage
 128     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 129     DEF_ZMM_OFFS(0),
 130     DEF_ZMM_OFFS(1),
 131     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 132     DEF_ZMM_UPPER_OFFS(16),
 133     DEF_ZMM_UPPER_OFFS(17),
 134     // 18..31 are implied in range usage
 135     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 136     fpu_stateH_end,
 137     r15_off, r15H_off,
 138     r14_off, r14H_off,
 139     r13_off, r13H_off,
 140     r12_off, r12H_off,
 141     r11_off, r11H_off,
 142     r10_off, r10H_off,
 143     r9_off,  r9H_off,
 144     r8_off,  r8H_off,
 145     rdi_off, rdiH_off,
 146     rsi_off, rsiH_off,
 147     ignore_off, ignoreH_off,  // extra copy of rbp
 148     rsp_off, rspH_off,
 149     rbx_off, rbxH_off,
 150     rdx_off, rdxH_off,
 151     rcx_off, rcxH_off,
 152     rax_off, raxH_off,
 153     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 154     align_off, alignH_off,
 155     flags_off, flagsH_off,
 156     // The frame sender code expects that rbp will be in the "natural" place and
 157     // will override any oopMap setting for it. We must therefore force the layout
 158     // so that it agrees with the frame sender code.
 159     rbp_off, rbpH_off,        // copy of rbp we will restore
 160     return_off, returnH_off,  // slot for return address
 161     reg_save_size             // size in compiler stack slots
 162   };
 163 
 164  public:
 165   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 166   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 167 
 168   // Offsets into the register save area
 169   // Used by deoptimization when it is managing result register
 170   // values on its own
 171 
 172   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 173   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 174   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 175   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 176   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 177 
 178   // During deoptimization only the result registers need to be restored,
 179   // all the other values have already been extracted.
 180   static void restore_result_registers(MacroAssembler* masm);
 181 };
 182 
 183 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 184   int off = 0;
 185   int num_xmm_regs = XMMRegister::available_xmm_registers();
 186 #if COMPILER2_OR_JVMCI
 187   if (save_wide_vectors && UseAVX == 0) {
 188     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 189   }
 190   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 191 #else
 192   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 193 #endif
 194 
  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 196   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 197   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 198   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 199   // CodeBlob frame size is in words.
 200   int frame_size_in_words = frame_size_in_bytes / wordSize;
 201   *total_frame_words = frame_size_in_words;
 202 
 203   // Save registers, fpu state, and flags.
 204   // We assume caller has already pushed the return address onto the
 205   // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.
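  // The resulting layout, from high to low addresses, is roughly:
  //   [return address] [saved rbp] [flags] [8-byte alignment pad]
  //   [GPRs saved by save_legacy_gprs] [FPU/XSAVE save area]
  //   [optional arg_reg_save_area]
  // which is what the layout enum above describes in jint-sized slots.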
 208 
 209   __ enter();          // rsp becomes 16-byte aligned here
 210   __ pushf();
 211   // Make sure rsp stays 16-byte aligned
 212   __ subq(rsp, 8);
 213   // Push CPU state in multiple of 16 bytes
 214   __ save_legacy_gprs();
 215   __ push_FPU_state();
 216 
 217 
 218   // push cpu state handles this on EVEX enabled targets
 219   if (save_wide_vectors) {
 220     // Save upper half of YMM registers(0..15)
 221     int base_addr = XSAVE_AREA_YMM_BEGIN;
 222     for (int n = 0; n < 16; n++) {
 223       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 224     }
 225     if (VM_Version::supports_evex()) {
 226       // Save upper half of ZMM registers(0..15)
 227       base_addr = XSAVE_AREA_ZMM_BEGIN;
 228       for (int n = 0; n < 16; n++) {
 229         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 230       }
 231       // Save full ZMM registers(16..num_xmm_regs)
 232       base_addr = XSAVE_AREA_UPPERBANK;
 233       off = 0;
 234       int vector_len = Assembler::AVX_512bit;
 235       for (int n = 16; n < num_xmm_regs; n++) {
 236         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 237       }
 238 #if COMPILER2_OR_JVMCI
 239       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 240       off = 0;
 241       for(int n = 0; n < KRegister::number_of_registers; n++) {
 242         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 243       }
 244 #endif
 245     }
 246   } else {
 247     if (VM_Version::supports_evex()) {
 248       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 249       int base_addr = XSAVE_AREA_UPPERBANK;
 250       off = 0;
 251       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
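      // Note: only the low 128 bits of these registers need saving here, but
      // the 128-bit form of evmovdqul on xmm16..xmm31 requires AVX512VL, so
      // without it we fall back to full 512-bit moves; the 64-byte stride of
      // the upper-bank slots leaves room for that.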
 252       for (int n = 16; n < num_xmm_regs; n++) {
 253         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 254       }
 255 #if COMPILER2_OR_JVMCI
 256       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 257       off = 0;
 258       for(int n = 0; n < KRegister::number_of_registers; n++) {
 259         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 260       }
 261 #endif
 262     }
 263   }
 264 
 265 #if COMPILER2_OR_JVMCI
 266   if (UseAPX) {
 267       int base_addr = XSAVE_AREA_EGPRS;
 268       off = 0;
 269       for(int n = 16; n < Register::number_of_registers; n++) {
 270         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 271       }
 272   }
 273 #endif
 274 
 275   __ vzeroupper();
 276   if (frame::arg_reg_save_area_bytes != 0) {
 277     // Allocate argument register save area
 278     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 279   }
 280 
 281   // Set an oopmap for the call site.  This oopmap will map all
 282   // oop-registers and debug-info registers as callee-saved.  This
 283   // will allow deoptimization at this safepoint to find all possible
 284   // debug-info recordings, as well as let GC find all oops.
 285 
 286   OopMapSet *oop_maps = new OopMapSet();
 287   OopMap* map = new OopMap(frame_size_in_slots, 0);
 288 
 289 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 290 
 291   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 293   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 294   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
 297   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 305   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 306   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 307 
 308   if (UseAPX) {
 309     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 324     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 325   }
 326   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 327   // on EVEX enabled targets, we get it included in the xsave area
 328   off = xmm0_off;
 329   int delta = xmm1_off - off;
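  // delta is the enum distance between consecutive xmm save slots:
  // 16 bytes per register / BytesPerInt = 4 jint slots.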
 330   for (int n = 0; n < 16; n++) {
 331     XMMRegister xmm_name = as_XMMRegister(n);
 332     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 333     off += delta;
 334   }
 335   if (UseAVX > 2) {
 336     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 337     off = zmm16_off;
 338     delta = zmm17_off - off;
 339     for (int n = 16; n < num_xmm_regs; n++) {
 340       XMMRegister zmm_name = as_XMMRegister(n);
 341       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 342       off += delta;
 343     }
 344   }
 345 
 346 #if COMPILER2_OR_JVMCI
 347   if (save_wide_vectors) {
 348     // Save upper half of YMM registers(0..15)
 349     off = ymm0_off;
 350     delta = ymm1_off - ymm0_off;
 351     for (int n = 0; n < 16; n++) {
 352       XMMRegister ymm_name = as_XMMRegister(n);
 353       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 354       off += delta;
 355     }
 356     if (VM_Version::supports_evex()) {
 357       // Save upper half of ZMM registers(0..15)
 358       off = zmm0_off;
 359       delta = zmm1_off - zmm0_off;
 360       for (int n = 0; n < 16; n++) {
 361         XMMRegister zmm_name = as_XMMRegister(n);
 362         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 363         off += delta;
 364       }
 365     }
 366   }
 367 #endif // COMPILER2_OR_JVMCI
 368 
 369   // %%% These should all be a waste but we'll keep things as they were for now
 370   if (true) {
 371     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 373     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 374     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 375     // rbp location is known implicitly by the frame sender code, needs no oopmap
 376     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 385     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 386     if (UseAPX) {
 387       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 402       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 403     }
 404     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 405     // on EVEX enabled targets, we get it included in the xsave area
 406     off = xmm0H_off;
 407     delta = xmm1H_off - off;
 408     for (int n = 0; n < 16; n++) {
 409       XMMRegister xmm_name = as_XMMRegister(n);
 410       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 411       off += delta;
 412     }
 413     if (UseAVX > 2) {
 414       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 415       off = zmm16H_off;
 416       delta = zmm17H_off - off;
 417       for (int n = 16; n < num_xmm_regs; n++) {
 418         XMMRegister zmm_name = as_XMMRegister(n);
 419         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 420         off += delta;
 421       }
 422     }
 423   }
 424 
 425   return map;
 426 }
 427 
 428 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 429   int num_xmm_regs = XMMRegister::available_xmm_registers();
 430   if (frame::arg_reg_save_area_bytes != 0) {
 431     // Pop arg register save area
 432     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 433   }
 434 
 435 #if COMPILER2_OR_JVMCI
 436   if (restore_wide_vectors) {
 437     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 438     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 439   }
 440 #else
 441   assert(!restore_wide_vectors, "vectors are generated only by C2");
 442 #endif
 443 
 444   __ vzeroupper();
 445 
 446   // On EVEX enabled targets everything is handled in pop fpu state
 447   if (restore_wide_vectors) {
 448     // Restore upper half of YMM registers (0..15)
 449     int base_addr = XSAVE_AREA_YMM_BEGIN;
 450     for (int n = 0; n < 16; n++) {
 451       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 452     }
 453     if (VM_Version::supports_evex()) {
 454       // Restore upper half of ZMM registers (0..15)
 455       base_addr = XSAVE_AREA_ZMM_BEGIN;
 456       for (int n = 0; n < 16; n++) {
 457         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 458       }
 459       // Restore full ZMM registers(16..num_xmm_regs)
 460       base_addr = XSAVE_AREA_UPPERBANK;
 461       int vector_len = Assembler::AVX_512bit;
 462       int off = 0;
 463       for (int n = 16; n < num_xmm_regs; n++) {
 464         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 465       }
 466 #if COMPILER2_OR_JVMCI
 467       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 468       off = 0;
 469       for (int n = 0; n < KRegister::number_of_registers; n++) {
 470         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 471       }
 472 #endif
 473     }
 474   } else {
 475     if (VM_Version::supports_evex()) {
 476       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 477       int base_addr = XSAVE_AREA_UPPERBANK;
 478       int off = 0;
 479       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 480       for (int n = 16; n < num_xmm_regs; n++) {
 481         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 482       }
 483 #if COMPILER2_OR_JVMCI
 484       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 485       off = 0;
 486       for (int n = 0; n < KRegister::number_of_registers; n++) {
 487         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 488       }
 489 #endif
 490     }
 491   }
 492 
 493 #if COMPILER2_OR_JVMCI
 494   if (UseAPX) {
 495     int base_addr = XSAVE_AREA_EGPRS;
 496     int off = 0;
 497     for (int n = 16; n < Register::number_of_registers; n++) {
 498       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 499     }
 500   }
 501 #endif
 502 
 503   // Recover CPU state
 504   __ pop_FPU_state();
 505   __ restore_legacy_gprs();
 506   __ addq(rsp, 8);
 507   __ popf();
 508   // Get the rbp described implicitly by the calling convention (no oopMap)
 509   __ pop(rbp);
 510 }
 511 
 512 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 513 
  // Just restore the result registers. Only used by deoptimization. By
 515   // now any callee save register that needs to be restored to a c2
 516   // caller of the deoptee has been extracted into the vframeArray
 517   // and will be stuffed into the c2i adapter we create for later
 518   // restoration so only result registers need to be restored here.
 519 
 520   // Restore fp result register
 521   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 522   // Restore integer result register
 523   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 524   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 525 
  // Pop all of the register save area off the stack except the return address
 527   __ addptr(rsp, return_offset_in_bytes());
 528 }
 529 
 530 // Is vector's size (in bytes) bigger than a size saved by default?
 531 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
 532 bool SharedRuntime::is_wide_vector(int size) {
 533   return size > 16;
 534 }
 535 
 536 // ---------------------------------------------------------------------------
 537 // Read the array of BasicTypes from a signature, and compute where the
 538 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 539 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 540 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 541 // as framesizes are fixed.
 542 // VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
 546 
 547 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 548 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 549 // units regardless of build. Of course for i486 there is no 64 bit build
 550 
 551 // The Java calling convention is a "shifted" version of the C ABI.
 552 // By skipping the first C ABI register we can call non-static jni methods
 553 // with small numbers of arguments without having to shuffle the arguments
 554 // at all. Since we control the java ABI we ought to at least get some
 555 // advantage out of it.
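// A sketch of the mapping (register names as in the tables below): for a
// signature (int, long, Object, double) -- the receiver, if any, counts as a
// leading Object -- we would hand out
//   int    -> j_rarg0        long   -> j_rarg1 (its T_VOID half is set_bad())
//   Object -> j_rarg2        double -> j_farg0 (its T_VOID half is set_bad())
// Only once the 6 integer or 8 float argument registers are exhausted do
// values spill to 4-byte stack slots, always allocated in aligned pairs.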
 556 
 557 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 558                                            VMRegPair *regs,
 559                                            int total_args_passed) {
 560 
 561   // Create the mapping between argument positions and
 562   // registers.
 563   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 564     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 565   };
 566   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 567     j_farg0, j_farg1, j_farg2, j_farg3,
 568     j_farg4, j_farg5, j_farg6, j_farg7
 569   };
 570 
 571 
 572   uint int_args = 0;
 573   uint fp_args = 0;
 574   uint stk_args = 0;
 575 
 576   for (int i = 0; i < total_args_passed; i++) {
 577     switch (sig_bt[i]) {
 578     case T_BOOLEAN:
 579     case T_CHAR:
 580     case T_BYTE:
 581     case T_SHORT:
 582     case T_INT:
 583       if (int_args < Argument::n_int_register_parameters_j) {
 584         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 585       } else {
 586         stk_args = align_up(stk_args, 2);
 587         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 588         stk_args += 1;
 589       }
 590       break;
 591     case T_VOID:
 592       // halves of T_LONG or T_DOUBLE
 593       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 594       regs[i].set_bad();
 595       break;
 596     case T_LONG:
 597       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 598       // fall through
 599     case T_OBJECT:
 600     case T_ARRAY:
 601     case T_ADDRESS:
 602       if (int_args < Argument::n_int_register_parameters_j) {
 603         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 604       } else {
 605         stk_args = align_up(stk_args, 2);
 606         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 607         stk_args += 2;
 608       }
 609       break;
 610     case T_FLOAT:
 611       if (fp_args < Argument::n_float_register_parameters_j) {
 612         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 613       } else {
 614         stk_args = align_up(stk_args, 2);
 615         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 616         stk_args += 1;
 617       }
 618       break;
 619     case T_DOUBLE:
 620       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 621       if (fp_args < Argument::n_float_register_parameters_j) {
 622         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 623       } else {
 624         stk_args = align_up(stk_args, 2);
 625         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 626         stk_args += 2;
 627       }
 628       break;
 629     default:
 630       ShouldNotReachHere();
 631       break;
 632     }
 633   }
 634 
 635   return stk_args;
 636 }
 637 
 638 // Same as java_calling_convention() but for multiple return
 639 // values. There's no way to store them on the stack so if we don't
 640 // have enough registers, multiple values can't be returned.
 641 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 642 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
 643 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 644                                           VMRegPair *regs,
 645                                           int total_args_passed) {
 646   // Create the mapping between argument positions and
 647   // registers.
 648   static const Register INT_ArgReg[java_return_convention_max_int] = {
 649     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 650   };
 651   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 652     j_farg0, j_farg1, j_farg2, j_farg3,
 653     j_farg4, j_farg5, j_farg6, j_farg7
 654   };
 655 
 656 
 657   uint int_args = 0;
 658   uint fp_args = 0;
 659 
 660   for (int i = 0; i < total_args_passed; i++) {
 661     switch (sig_bt[i]) {
 662     case T_BOOLEAN:
 663     case T_CHAR:
 664     case T_BYTE:
 665     case T_SHORT:
 666     case T_INT:
 667       if (int_args < Argument::n_int_register_parameters_j+1) {
 668         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 669         int_args++;
 670       } else {
 671         return -1;
 672       }
 673       break;
 674     case T_VOID:
 675       // halves of T_LONG or T_DOUBLE
 676       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 677       regs[i].set_bad();
 678       break;
 679     case T_LONG:
 680       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 681       // fall through
 682     case T_OBJECT:
 683     case T_ARRAY:
 684     case T_ADDRESS:
 685     case T_METADATA:
 686       if (int_args < Argument::n_int_register_parameters_j+1) {
 687         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 688         int_args++;
 689       } else {
 690         return -1;
 691       }
 692       break;
 693     case T_FLOAT:
 694       if (fp_args < Argument::n_float_register_parameters_j) {
 695         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 696         fp_args++;
 697       } else {
 698         return -1;
 699       }
 700       break;
 701     case T_DOUBLE:
 702       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 703       if (fp_args < Argument::n_float_register_parameters_j) {
 704         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 705         fp_args++;
 706       } else {
 707         return -1;
 708       }
 709       break;
 710     default:
 711       ShouldNotReachHere();
 712       break;
 713     }
 714   }
 715 
 716   return int_args + fp_args;
 717 }
 718 
// Patch the caller's callsite with the entry to compiled code, if it exists.
 720 static void patch_callers_callsite(MacroAssembler *masm) {
 721   Label L;
 722   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 723   __ jcc(Assembler::equal, L);
 724 
 725   // Save the current stack pointer
 726   __ mov(r13, rsp);
 727   // Schedule the branch target address early.
 728   // Call into the VM to patch the caller, then jump to compiled callee
 729   // rax isn't live so capture return address while we easily can
 730   __ movptr(rax, Address(rsp, 0));
 731 
 732   // align stack so push_CPU_state doesn't fault
 733   __ andptr(rsp, -(StackAlignmentInBytes));
 734   __ push_CPU_state();
 735   __ vzeroupper();
 736   // VM needs caller's callsite
 737   // VM needs target method
 738   // This needs to be a long call since we will relocate this adapter to
 739   // the codeBuffer and it may not reach
 740 
 741   // Allocate argument register save area
 742   if (frame::arg_reg_save_area_bytes != 0) {
 743     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 744   }
 745   __ mov(c_rarg0, rbx);
 746   __ mov(c_rarg1, rax);
 747   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 748 
 749   // De-allocate argument register save area
 750   if (frame::arg_reg_save_area_bytes != 0) {
 751     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 752   }
 753 
 754   __ vzeroupper();
 755   __ pop_CPU_state();
 756   // restore sp
 757   __ mov(rsp, r13);
 758   __ bind(L);
 759 }
 760 
 761 // For each inline type argument, sig includes the list of fields of
 762 // the inline type. This utility function computes the number of
 763 // arguments for the call if inline types are passed by reference (the
 764 // calling convention the interpreter expects).
 765 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 766   int total_args_passed = 0;
 767   if (InlineTypePassFieldsAsArgs) {
 768     for (int i = 0; i < sig_extended->length(); i++) {
 769       BasicType bt = sig_extended->at(i)._bt;
 770       if (bt == T_METADATA) {
 771         // In sig_extended, an inline type argument starts with:
 772         // T_METADATA, followed by the types of the fields of the
 773         // inline type and T_VOID to mark the end of the value
 774         // type. Inline types are flattened so, for instance, in the
 775         // case of an inline type with an int field and an inline type
 776         // field that itself has 2 fields, an int and a long:
 777         // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
 778         // slot for the T_LONG) T_VOID (inner inline type) T_VOID
 779         // (outer inline type)
 780         total_args_passed++;
 781         int vt = 1;
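        // vt tracks the inline type nesting depth; it drops back to 0 once the
        // T_VOID that closes the outermost T_METADATA has been seen.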
 782         do {
 783           i++;
 784           BasicType bt = sig_extended->at(i)._bt;
 785           BasicType prev_bt = sig_extended->at(i-1)._bt;
 786           if (bt == T_METADATA) {
 787             vt++;
 788           } else if (bt == T_VOID &&
 789                      prev_bt != T_LONG &&
 790                      prev_bt != T_DOUBLE) {
 791             vt--;
 792           }
 793         } while (vt != 0);
 794       } else {
 795         total_args_passed++;
 796       }
 797     }
 798   } else {
 799     total_args_passed = sig_extended->length();
 800   }
 801   return total_args_passed;
 802 }
 803 
 804 
 805 static void gen_c2i_adapter_helper(MacroAssembler* masm,
 806                                    BasicType bt,
 807                                    BasicType prev_bt,
 808                                    size_t size_in_bytes,
 809                                    const VMRegPair& reg_pair,
 810                                    const Address& to,
 811                                    int extraspace,
 812                                    bool is_oop) {
 813   if (bt == T_VOID) {
 814     assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
 815     return;
 816   }
 817 
 818   // Say 4 args:
 819   // i   st_off
 820   // 0   32 T_LONG
 821   // 1   24 T_VOID
 822   // 2   16 T_OBJECT
 823   // 3    8 T_BOOL
 824   // -    0 return address
 825   //
  // However, to make things extra confusing: because we can fit a long/double in
  // a single slot on a 64 bit vm and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See, I said it was confusing.
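  // For the example above this means the T_LONG at i == 0 is written at
  // st_off 24 (the T_VOID slot), not at 32.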
 830 
 831   bool wide = (size_in_bytes == wordSize);
 832   VMReg r_1 = reg_pair.first();
 833   VMReg r_2 = reg_pair.second();
 834   assert(r_2->is_valid() == wide, "invalid size");
 835   if (!r_1->is_valid()) {
 836     assert(!r_2->is_valid(), "must be invalid");
 837     return;
 838   }
 839 
 840   if (!r_1->is_XMMRegister()) {
 841     Register val = rax;
 842     if (r_1->is_stack()) {
 843       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 844       __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
 845     } else {
 846       val = r_1->as_Register();
 847     }
 848     assert_different_registers(to.base(), val, rscratch1);
 849     if (is_oop) {
 850       __ push(r13);
 851       __ push(rbx);
 852       __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
 853       __ pop(rbx);
 854       __ pop(r13);
 855     } else {
 856       __ store_sized_value(to, val, size_in_bytes);
 857     }
 858   } else {
 859     if (wide) {
 860       __ movdbl(to, r_1->as_XMMRegister());
 861     } else {
 862       __ movflt(to, r_1->as_XMMRegister());
 863     }
 864   }
 865 }
 866 
 867 static void gen_c2i_adapter(MacroAssembler *masm,
 868                             const GrowableArray<SigEntry>* sig_extended,
 869                             const VMRegPair *regs,
 870                             bool requires_clinit_barrier,
 871                             address& c2i_no_clinit_check_entry,
 872                             Label& skip_fixup,
 873                             address start,
 874                             OopMapSet* oop_maps,
 875                             int& frame_complete,
 876                             int& frame_size_in_words,
 877                             bool alloc_inline_receiver) {
 878   if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) {
 879     Label L_skip_barrier;
 880     Register method = rbx;
 881 
 882     { // Bypass the barrier for non-static methods
 883       Register flags = rscratch1;
 884       __ movl(flags, Address(method, Method::access_flags_offset()));
 885       __ testl(flags, JVM_ACC_STATIC);
 886       __ jcc(Assembler::zero, L_skip_barrier); // non-static
 887     }
 888 
 889     Register klass = rscratch1;
 890     __ load_method_holder(klass, method);
 891     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
 892 
 893     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
 894 
 895     __ bind(L_skip_barrier);
 896     c2i_no_clinit_check_entry = __ pc();
 897   }
 898 
 899   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 900   bs->c2i_entry_barrier(masm);
 901 
 902   // Before we get into the guts of the C2I adapter, see if we should be here
 903   // at all.  We've come from compiled code and are attempting to jump to the
 904   // interpreter, which means the caller made a static call to get here
 905   // (vcalls always get a compiled target if there is one).  Check for a
 906   // compiled target.  If there is one, we need to patch the caller's call.
 907   patch_callers_callsite(masm);
 908 
 909   __ bind(skip_fixup);
 910 
 911   if (InlineTypePassFieldsAsArgs) {
 912     // Is there an inline type argument?
 913     bool has_inline_argument = false;
 914     for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
 915       has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
 916     }
 917     if (has_inline_argument) {
 918       // There is at least an inline type argument: we're coming from
 919       // compiled code so we have no buffers to back the inline types.
 920       // Allocate the buffers here with a runtime call.
 921       OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
 922 
 923       frame_complete = __ offset();
 924 
 925       __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
 926 
 927       __ mov(c_rarg0, r15_thread);
 928       __ mov(c_rarg1, rbx);
 929       __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
 930       __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
 931 
 932       oop_maps->add_gc_map((int)(__ pc() - start), map);
 933       __ reset_last_Java_frame(false);
 934 
 935       RegisterSaver::restore_live_registers(masm);
 936 
 937       Label no_exception;
 938       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
 939       __ jcc(Assembler::equal, no_exception);
 940 
 941       __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
 942       __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 943       __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 944 
 945       __ bind(no_exception);
 946 
 947       // We get an array of objects from the runtime call
 948       __ get_vm_result(rscratch2, r15_thread); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
 949       __ get_vm_result_2(rbx, r15_thread); // TODO: required to keep the callee Method live?
 950     }
 951   }
 952 
 953   // Since all args are passed on the stack, total_args_passed *
 954   // Interpreter::stackElementSize is the space we need.
 955   int total_args_passed = compute_total_args_passed_int(sig_extended);
 956   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 957 
 958   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 959 
 960   // stack is aligned, keep it that way
 961   // This is not currently needed or enforced by the interpreter, but
 962   // we might as well conform to the ABI.
 963   extraspace = align_up(extraspace, 2*wordSize);
 964 
 965   // set senderSP value
 966   __ lea(r13, Address(rsp, wordSize));
 967 
 968 #ifdef ASSERT
 969   __ check_stack_alignment(r13, "sender stack not aligned");
 970 #endif
 971   if (extraspace > 0) {
 972     // Pop the return address
 973     __ pop(rax);
 974 
 975     __ subptr(rsp, extraspace);
 976 
 977     // Push the return address
 978     __ push(rax);
 979 
    // Account for the return address location since we store it first rather
    // than holding it in a register across all the shuffling
 982     extraspace += wordSize;
 983   }
 984 
 985 #ifdef ASSERT
 986   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 987 #endif
 988 
 989   // Now write the args into the outgoing interpreter space
 990 
 991   // next_arg_comp is the next argument from the compiler point of
 992   // view (inline type fields are passed in registers/on the stack). In
 993   // sig_extended, an inline type argument starts with: T_METADATA,
 994   // followed by the types of the fields of the inline type and T_VOID
 995   // to mark the end of the inline type. ignored counts the number of
 996   // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
 997   // used to get the buffer for that argument from the pool of buffers
 998   // we allocated above and want to pass to the
 999   // interpreter. next_arg_int is the next argument from the
1000   // interpreter point of view (inline types are passed by reference).
1001   for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
1002        next_arg_comp < sig_extended->length(); next_arg_comp++) {
1003     assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
1004     assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
1005     BasicType bt = sig_extended->at(next_arg_comp)._bt;
1006     int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
1007     if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
1008       int next_off = st_off - Interpreter::stackElementSize;
1009       const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
1010       const VMRegPair reg_pair = regs[next_arg_comp-ignored];
1011       size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
1012       gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1013                              size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
1014       next_arg_int++;
1015 #ifdef ASSERT
1016       if (bt == T_LONG || bt == T_DOUBLE) {
1017         // Overwrite the unused slot with known junk
1018         __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
1019         __ movptr(Address(rsp, st_off), rax);
1020       }
1021 #endif /* ASSERT */
1022     } else {
1023       ignored++;
1024       // get the buffer from the just allocated pool of buffers
1025       int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
1026       __ load_heap_oop(r14, Address(rscratch2, index));
1027       next_vt_arg++; next_arg_int++;
1028       int vt = 1;
1029       // write fields we get from compiled code in registers/stack
1030       // slots to the buffer: we know we are done with that inline type
1031       // argument when we hit the T_VOID that acts as an end of inline
1032       // type delimiter for this inline type. Inline types are flattened
1033       // so we might encounter embedded inline types. Each entry in
1034       // sig_extended contains a field offset in the buffer.
1035       Label L_null;
1036       do {
1037         next_arg_comp++;
1038         BasicType bt = sig_extended->at(next_arg_comp)._bt;
1039         BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
1040         if (bt == T_METADATA) {
1041           vt++;
1042           ignored++;
1043         } else if (bt == T_VOID &&
1044                    prev_bt != T_LONG &&
1045                    prev_bt != T_DOUBLE) {
1046           vt--;
1047           ignored++;
1048         } else {
1049           int off = sig_extended->at(next_arg_comp)._offset;
1050           if (off == -1) {
1051             // Nullable inline type argument, emit null check
1052             VMReg reg = regs[next_arg_comp-ignored].first();
1053             Label L_notNull;
1054             if (reg->is_stack()) {
1055               int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
1056               __ testb(Address(rsp, ld_off), 1);
1057             } else {
1058               __ testb(reg->as_Register(), 1);
1059             }
1060             __ jcc(Assembler::notZero, L_notNull);
1061             __ movptr(Address(rsp, st_off), 0);
1062             __ jmp(L_null);
1063             __ bind(L_notNull);
1064             continue;
1065           }
1066           assert(off > 0, "offset in object should be positive");
1067           size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
1068           bool is_oop = is_reference_type(bt);
1069           gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
1070                                  size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
1071         }
1072       } while (vt != 0);
1073       // pass the buffer to the interpreter
1074       __ movptr(Address(rsp, st_off), r14);
1075       __ bind(L_null);
1076     }
1077   }
1078 
1079   // Schedule the branch target address early.
1080   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
1081   __ jmp(rcx);
1082 }
1083 
1084 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
1085                         address code_start, address code_end,
1086                         Label& L_ok) {
1087   Label L_fail;
1088   __ lea(temp_reg, AddressLiteral(code_start, relocInfo::none));
1089   __ cmpptr(pc_reg, temp_reg);
1090   __ jcc(Assembler::belowEqual, L_fail);
1091   __ lea(temp_reg, AddressLiteral(code_end, relocInfo::none));
1092   __ cmpptr(pc_reg, temp_reg);
1093   __ jcc(Assembler::below, L_ok);
1094   __ bind(L_fail);
1095 }
1096 
1097 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
1098                                     int comp_args_on_stack,
1099                                     const GrowableArray<SigEntry>* sig,
1100                                     const VMRegPair *regs) {
1101 
  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args, as
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment we expect in all compiled code and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
1110 
1111   // Adapters can be frameless because they do not require the caller
1112   // to perform additional cleanup work, such as correcting the stack pointer.
1113   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
1114   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
1115   // even if a callee has modified the stack pointer.
1116   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
1117   // routinely repairs its caller's stack pointer (from sender_sp, which is set
1118   // up via the senderSP register).
1119   // In other words, if *either* the caller or callee is interpreted, we can
1120   // get the stack pointer repaired after a call.
1121   // This is why c2i and i2c adapters cannot be indefinitely composed.
1122   // In particular, if a c2i adapter were to somehow call an i2c adapter,
1123   // both caller and callee would be compiled methods, and neither would
1124   // clean up the stack pointer changes performed by the two adapters.
1125   // If this happens, control eventually transfers back to the compiled
1126   // caller, but with an uncorrected stack, causing delayed havoc.
1127 
1128   if (VerifyAdapterCalls &&
1129       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
1130     // So, let's test for cascading c2i/i2c adapters right now.
1131     //  assert(Interpreter::contains($return_addr) ||
1132     //         StubRoutines::contains($return_addr),
1133     //         "i2c adapter must return to an interpreter frame");
1134     __ block_comment("verify_i2c { ");
1135     // Pick up the return address
1136     __ movptr(rax, Address(rsp, 0));
1137     Label L_ok;
1138     if (Interpreter::code() != nullptr) {
1139       range_check(masm, rax, r11,
1140                   Interpreter::code()->code_start(),
1141                   Interpreter::code()->code_end(),
1142                   L_ok);
1143     }
1144     if (StubRoutines::initial_stubs_code() != nullptr) {
1145       range_check(masm, rax, r11,
1146                   StubRoutines::initial_stubs_code()->code_begin(),
1147                   StubRoutines::initial_stubs_code()->code_end(),
1148                   L_ok);
1149     }
1150     if (StubRoutines::final_stubs_code() != nullptr) {
1151       range_check(masm, rax, r11,
1152                   StubRoutines::final_stubs_code()->code_begin(),
1153                   StubRoutines::final_stubs_code()->code_end(),
1154                   L_ok);
1155     }
1156     const char* msg = "i2c adapter must return to an interpreter frame";
1157     __ block_comment(msg);
1158     __ stop(msg);
1159     __ bind(L_ok);
1160     __ block_comment("} verify_i2ce ");
1161   }
1162 
1163   // Must preserve original SP for loading incoming arguments because
1164   // we need to align the outgoing SP for compiled code.
1165   __ movptr(r11, rsp);
1166 
1167   // Pick up the return address
1168   __ pop(rax);
1169 
1170   // Convert 4-byte c2 stack slots to words.
1171   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
1172 
1173   if (comp_args_on_stack) {
1174     __ subptr(rsp, comp_words_on_stack * wordSize);
1175   }
1176 
1177   // Ensure compiled code always sees stack at proper alignment
1178   __ andptr(rsp, -16);
1179 
  // Push the return address and misalign the stack so that the youngest frame
  // sees it exactly where a call instruction would have placed it.
1182   __ push(rax);
1183 
1184   // Put saved SP in another register
1185   const Register saved_sp = rax;
1186   __ movptr(saved_sp, r11);
1187 
1188   // Will jump to the compiled code just as if compiled code was doing it.
1189   // Pre-load the register-jump target early, to schedule it better.
1190   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1191 
1192 #if INCLUDE_JVMCI
1193   if (EnableJVMCI) {
1194     // check if this call should be routed towards a specific entry point
1195     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1196     Label no_alternative_target;
1197     __ jcc(Assembler::equal, no_alternative_target);
1198     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1199     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1200     __ bind(no_alternative_target);
1201   }
1202 #endif // INCLUDE_JVMCI
1203 
1204   int total_args_passed = sig->length();
1205 
1206   // Now generate the shuffle code.  Pick up all register args and move the
1207   // rest through the floating point stack top.
1208   for (int i = 0; i < total_args_passed; i++) {
1209     BasicType bt = sig->at(i)._bt;
1210     if (bt == T_VOID) {
1211       // Longs and doubles are passed in native word order, but misaligned
1212       // in the 32-bit build.
1213       BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1214       assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1215       continue;
1216     }
1217 
1218     // Pick up 0, 1 or 2 words from SP+offset.
1219 
1220     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1221             "scrambled load targets?");
1222     // Load in argument order going down.
1223     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1224     // Point to interpreter value (vs. tag)
1225     int next_off = ld_off - Interpreter::stackElementSize;
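    // e.g. with 4 args, arg i == 0 is loaded from saved_sp + 4*stackElementSize;
    // for a long/double the value actually sits one slot lower, at next_off.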
1229     VMReg r_1 = regs[i].first();
1230     VMReg r_2 = regs[i].second();
1231     if (!r_1->is_valid()) {
1232       assert(!r_2->is_valid(), "");
1233       continue;
1234     }
1235     if (r_1->is_stack()) {
1236       // Convert stack slot to an SP offset (+ wordSize to account for return address )
1237       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1238 
      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going through a c2i because of a miss, a reasonable value of r13
      // will be generated.
1242       if (!r_2->is_valid()) {
1243         // sign extend???
1244         __ movl(r13, Address(saved_sp, ld_off));
1245         __ movptr(Address(rsp, st_off), r13);
1246       } else {
1247         //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
1251         //
1252         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
1253         // are accessed as negative so LSW is at LOW address
1254 
1255         // ld_off is MSW so get LSW
1256         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1257                            next_off : ld_off;
1258         __ movq(r13, Address(saved_sp, offset));
1259         // st_off is LSW (i.e. reg.first())
1260         __ movq(Address(rsp, st_off), r13);
1261       }
1262     } else if (r_1->is_Register()) {  // Register argument
1263       Register r = r_1->as_Register();
1264       assert(r != rax, "must be different");
1265       if (r_2->is_valid()) {
1266         //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
1270 
1271         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1272                            next_off : ld_off;
1273 
1274         // this can be a misaligned move
1275         __ movq(r, Address(saved_sp, offset));
1276       } else {
1277         // sign extend and use a full word?
1278         __ movl(r, Address(saved_sp, ld_off));
1279       }
1280     } else {
1281       if (!r_2->is_valid()) {
1282         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1283       } else {
1284         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1285       }
1286     }
1287   }
1288 
1289   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1290 
1291   // 6243940 We might end up in handle_wrong_method if
1292   // the callee is deoptimized as we race thru here. If that
1293   // happens we don't want to take a safepoint because the
1294   // caller frame will look interpreted and arguments are now
1295   // "compiled" so it is much better to make this transition
1296   // invisible to the stack walking code. Unfortunately if
1297   // we try and find the callee by normal means a safepoint
1298   // is possible. So we stash the desired callee in the thread
  // and the vm will find it there should this case occur.
1300 
1301   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1302 
  // Put the Method* where a c2i adapter would expect it, should we end up there.
  // This is only needed because the c2 resolve stubs return the Method* as a
  // result in rax.
1306   __ mov(rax, rbx);
1307   __ jmp(r11);
1308 }
1309 
1310 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1311   Register data = rax;
1312   __ ic_check(1 /* end_alignment */);
1313   __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1314 
  // The method might have been compiled since the call site was patched to
  // interpreted; if that is the case, treat it as a miss so we can get the
  // call site corrected.
1318   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1319   __ jcc(Assembler::equal, skip_fixup);
1320   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1321 }
1322 
1323 // ---------------------------------------------------------------
1324 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1325                                                             int comp_args_on_stack,
1326                                                             const GrowableArray<SigEntry>* sig,
1327                                                             const VMRegPair* regs,
1328                                                             const GrowableArray<SigEntry>* sig_cc,
1329                                                             const VMRegPair* regs_cc,
1330                                                             const GrowableArray<SigEntry>* sig_cc_ro,
1331                                                             const VMRegPair* regs_cc_ro,
1332                                                             AdapterFingerPrint* fingerprint,
1333                                                             AdapterBlob*& new_adapter,
1334                                                             bool allocate_code_blob) {
1335   address i2c_entry = __ pc();
1336   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1337 
1338   // -------------------------------------------------------------------------
1339   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1340   // to the interpreter.  The args start out packed in the compiled layout.  They
1341   // need to be unpacked into the interpreter layout.  This will almost always
1342   // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
1344   // On exit from the interpreter, the interpreter will restore our SP (lest the
1345   // compiled code, which relies solely on SP and not RBP, get sick).
1346 
1347   address c2i_unverified_entry        = __ pc();
1348   address c2i_unverified_inline_entry = __ pc();
1349   Label skip_fixup;
1350 
1351   gen_inline_cache_check(masm, skip_fixup);
1352 
1353   OopMapSet* oop_maps = new OopMapSet();
1354   int frame_complete = CodeOffsets::frame_never_safe;
1355   int frame_size_in_words = 0;
1356 
1357   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1358   address c2i_no_clinit_check_entry = nullptr;
1359   address c2i_inline_ro_entry = __ pc();
1360   if (regs_cc != regs_cc_ro) {
1361     // No class init barrier needed because method is guaranteed to be non-static
1362     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, c2i_no_clinit_check_entry,
1363                     skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1364     skip_fixup.reset();
1365   }
1366 
1367   // Scalarized c2i adapter
1368   address c2i_entry        = __ pc();
1369   address c2i_inline_entry = __ pc();
1370   gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1371                   skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);
1372 
1373   // Non-scalarized c2i adapter
1374   if (regs != regs_cc) {
1375     c2i_unverified_inline_entry = __ pc();
1376     Label inline_entry_skip_fixup;
1377     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1378 
1379     c2i_inline_entry = __ pc();
1380     gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1381                     inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1382   }
1383 
1384   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1385   // the GC knows about the location of oop argument locations passed to the c2i adapter.
1386   if (allocate_code_blob) {
1387     bool caller_must_gc_arguments = (regs != regs_cc);
1388     new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1389   }
1390 
1391   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry, c2i_unverified_inline_entry, c2i_no_clinit_check_entry);
1392 }
1393 
1394 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1395                                          VMRegPair *regs,
1396                                          int total_args_passed) {
1397 
// We return the number of VMRegImpl stack slots we need to reserve for all
// the arguments NOT counting out_preserve_stack_slots.
1400 
1401 // NOTE: These arrays will have to change when c1 is ported
1402 #ifdef _WIN64
1403     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1404       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1405     };
1406     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1407       c_farg0, c_farg1, c_farg2, c_farg3
1408     };
1409 #else
1410     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1411       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1412     };
1413     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1414       c_farg0, c_farg1, c_farg2, c_farg3,
1415       c_farg4, c_farg5, c_farg6, c_farg7
1416     };
1417 #endif // _WIN64
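    // Illustrative mapping (a sketch, assuming the non-Windows table above):
    // for a C signature (JNIEnv*, jobject, jint, jlong, jfloat, jdouble) the
    // loop below assigns
    //   JNIEnv* -> c_rarg0   jobject -> c_rarg1   jint    -> c_rarg2
    //   jlong   -> c_rarg3   jfloat  -> c_farg0   jdouble -> c_farg1
    // and returns stk_args == 0 since nothing spills to the stack.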
1418 
1419 
1420     uint int_args = 0;
1421     uint fp_args = 0;
    uint stk_args = 0; // inc by 2 each time: each stack arg occupies a 64-bit word, i.e. two 32-bit VMRegImpl slots
1423 
1424     for (int i = 0; i < total_args_passed; i++) {
1425       switch (sig_bt[i]) {
1426       case T_BOOLEAN:
1427       case T_CHAR:
1428       case T_BYTE:
1429       case T_SHORT:
1430       case T_INT:
1431         if (int_args < Argument::n_int_register_parameters_c) {
1432           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1433 #ifdef _WIN64
1434           fp_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1436           stk_args += 2;
1437 #endif
1438         } else {
1439           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1440           stk_args += 2;
1441         }
1442         break;
1443       case T_LONG:
1444         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1445         // fall through
1446       case T_OBJECT:
1447       case T_ARRAY:
1448       case T_ADDRESS:
1449       case T_METADATA:
1450         if (int_args < Argument::n_int_register_parameters_c) {
1451           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1452 #ifdef _WIN64
1453           fp_args++;
1454           stk_args += 2;
1455 #endif
1456         } else {
1457           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1458           stk_args += 2;
1459         }
1460         break;
1461       case T_FLOAT:
1462         if (fp_args < Argument::n_float_register_parameters_c) {
1463           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1464 #ifdef _WIN64
1465           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1467           stk_args += 2;
1468 #endif
1469         } else {
1470           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1471           stk_args += 2;
1472         }
1473         break;
1474       case T_DOUBLE:
1475         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1476         if (fp_args < Argument::n_float_register_parameters_c) {
1477           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1478 #ifdef _WIN64
1479           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1481           stk_args += 2;
1482 #endif
1483         } else {
1484           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1485           stk_args += 2;
1486         }
1487         break;
1488       case T_VOID: // Halves of longs and doubles
1489         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1490         regs[i].set_bad();
1491         break;
1492       default:
1493         ShouldNotReachHere();
1494         break;
1495       }
1496     }
1497 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space (the
  // 32-byte home space) for the 4 64-bit register parameters to be stored down.
1500   if (stk_args < 8) {
1501     stk_args = 8;
1502   }
1503 #endif // _WIN64
1504 
1505   return stk_args;
1506 }
1507 
1508 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1509                                              uint num_bits,
1510                                              uint total_args_passed) {
1511   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1512          "only certain vector sizes are supported for now");
1513 
1514   static const XMMRegister VEC_ArgReg[32] = {
1515      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1516      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1517     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1518     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1519   };
1520 
1521   uint stk_args = 0;
1522   uint fp_args = 0;
1523 
1524   for (uint i = 0; i < total_args_passed; i++) {
1525     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1526     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
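    // next_val is the index of the last 32-bit VMRegImpl slot covered by the
    // vector, i.e. num_bits/32 - 1, so the pair below spans the whole vector.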
1527     regs[i].set_pair(vmreg->next(next_val), vmreg);
1528   }
1529 
1530   return stk_args;
1531 }
1532 
1533 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1536   switch (ret_type) {
1537   case T_FLOAT:
1538     __ movflt(Address(rbp, -wordSize), xmm0);
1539     break;
1540   case T_DOUBLE:
1541     __ movdbl(Address(rbp, -wordSize), xmm0);
1542     break;
1543   case T_VOID:  break;
1544   default: {
1545     __ movptr(Address(rbp, -wordSize), rax);
1546     }
1547   }
1548 }
1549 
1550 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1553   switch (ret_type) {
1554   case T_FLOAT:
1555     __ movflt(xmm0, Address(rbp, -wordSize));
1556     break;
1557   case T_DOUBLE:
1558     __ movdbl(xmm0, Address(rbp, -wordSize));
1559     break;
1560   case T_VOID:  break;
1561   default: {
1562     __ movptr(rax, Address(rbp, -wordSize));
1563     }
1564   }
1565 }
1566 
1567 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1568     for ( int i = first_arg ; i < arg_count ; i++ ) {
1569       if (args[i].first()->is_Register()) {
1570         __ push(args[i].first()->as_Register());
1571       } else if (args[i].first()->is_XMMRegister()) {
1572         __ subptr(rsp, 2*wordSize);
1573         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1574       }
1575     }
1576 }
1577 
1578 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1579     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1580       if (args[i].first()->is_Register()) {
1581         __ pop(args[i].first()->as_Register());
1582       } else if (args[i].first()->is_XMMRegister()) {
1583         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1584         __ addptr(rsp, 2*wordSize);
1585       }
1586     }
1587 }
1588 
1589 static void verify_oop_args(MacroAssembler* masm,
1590                             const methodHandle& method,
1591                             const BasicType* sig_bt,
1592                             const VMRegPair* regs) {
1593   Register temp_reg = rbx;  // not part of any compiled calling seq
1594   if (VerifyOops) {
1595     for (int i = 0; i < method->size_of_parameters(); i++) {
1596       if (is_reference_type(sig_bt[i])) {
1597         VMReg r = regs[i].first();
1598         assert(r->is_valid(), "bad oop arg");
1599         if (r->is_stack()) {
1600           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1601           __ verify_oop(temp_reg);
1602         } else {
1603           __ verify_oop(r->as_Register());
1604         }
1605       }
1606     }
1607   }
1608 }
1609 
1610 static void check_continuation_enter_argument(VMReg actual_vmreg,
1611                                               Register expected_reg,
1612                                               const char* name) {
1613   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1614   assert(actual_vmreg->as_Register() == expected_reg,
1615          "%s is in unexpected register: %s instead of %s",
1616          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1617 }
1618 
1619 
1620 //---------------------------- continuation_enter_setup ---------------------------
1621 //
1622 // Arguments:
1623 //   None.
1624 //
1625 // Results:
1626 //   rsp: pointer to blank ContinuationEntry
1627 //
1628 // Kills:
1629 //   rax
1630 //
1631 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1632   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1633   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1634   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1635 
1636   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1637   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1638 
1639   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1640   OopMap* map = new OopMap(frame_size, 0);
1641 
1642   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1643   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1644   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
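  // At this point (summarizing the stores above):
  //   rsp                       -> the new, still blank ContinuationEntry
  //   ContinuationEntry::parent == the previous JavaThread::_cont_entry
  //   JavaThread::_cont_entry   == rsp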
1645 
1646   return map;
1647 }
1648 
1649 //---------------------------- fill_continuation_entry ---------------------------
1650 //
1651 // Arguments:
1652 //   rsp: pointer to blank Continuation entry
1653 //   reg_cont_obj: pointer to the continuation
1654 //   reg_flags: flags
1655 //
1656 // Results:
1657 //   rsp: pointer to filled out ContinuationEntry
1658 //
1659 // Kills:
1660 //   rax
1661 //
1662 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1663   assert_different_registers(rax, reg_cont_obj, reg_flags);
1664 #ifdef ASSERT
1665   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1666 #endif
1667   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1668   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1669   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1670   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1671   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1672 
1673   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1674   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1675   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1676   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1677 
1678   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1679   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1680 }
1681 
1682 //---------------------------- continuation_enter_cleanup ---------------------------
1683 //
1684 // Arguments:
1685 //   rsp: pointer to the ContinuationEntry
1686 //
1687 // Results:
1688 //   rsp: pointer to the spilled rbp in the entry frame
1689 //
1690 // Kills:
1691 //   rbx
1692 //
static void continuation_enter_cleanup(MacroAssembler* masm) {
1694 #ifdef ASSERT
1695   Label L_good_sp;
1696   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1697   __ jcc(Assembler::equal, L_good_sp);
1698   __ stop("Incorrect rsp at continuation_enter_cleanup");
1699   __ bind(L_good_sp);
1700 #endif
1701   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1702   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1703 
1704   if (CheckJNICalls) {
1705     // Check if this is a virtual thread continuation
1706     Label L_skip_vthread_code;
1707     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1708     __ jcc(Assembler::equal, L_skip_vthread_code);
1709 
1710     // If the held monitor count is > 0 and this vthread is terminating then
1711     // it failed to release a JNI monitor. So we issue the same log message
1712     // that JavaThread::exit does.
1713     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1714     __ jcc(Assembler::equal, L_skip_vthread_code);
1715 
1716     // rax may hold an exception oop, save it before the call
1717     __ push(rax);
1718     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1719     __ pop(rax);
1720 
1721     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1722     // on termination. The held count is implicitly zeroed below when we restore from
1723     // the parent held count (which has to be zero).
1724     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1725 
1726     __ bind(L_skip_vthread_code);
1727   }
1728 #ifdef ASSERT
1729   else {
1730     // Check if this is a virtual thread continuation
1731     Label L_skip_vthread_code;
1732     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1733     __ jcc(Assembler::equal, L_skip_vthread_code);
1734 
1735     // See comment just above. If not checking JNI calls the JNI count is only
1736     // needed for assertion checking.
1737     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1738 
1739     __ bind(L_skip_vthread_code);
1740   }
1741 #endif
1742 
1743   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1744   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1745 
1746   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1747   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1748   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1749 }
1750 
1751 static void gen_continuation_enter(MacroAssembler* masm,
1752                                    const VMRegPair* regs,
1753                                    int& exception_offset,
1754                                    OopMapSet* oop_maps,
1755                                    int& frame_complete,
1756                                    int& stack_slots,
1757                                    int& interpreted_entry_offset,
1758                                    int& compiled_entry_offset) {
1759 
1760   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1761   int pos_cont_obj   = 0;
1762   int pos_is_cont    = 1;
1763   int pos_is_virtual = 2;
1764 
  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside in these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
1769   Register reg_cont_obj   = c_rarg1;
1770   Register reg_is_cont    = c_rarg2;
1771   Register reg_is_virtual = c_rarg3;
1772 
1773   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1774   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1775   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1776 
1777   // Utility methods kill rax, make sure there are no collisions
1778   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1779 
1780   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1781                          relocInfo::static_call_type);
1782 
1783   address start = __ pc();
1784 
1785   Label L_thaw, L_exit;
1786 
1787   // i2i entry used at interp_only_mode only
1788   interpreted_entry_offset = __ pc() - start;
1789   {
1790 #ifdef ASSERT
1791     Label is_interp_only;
1792     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1793     __ jcc(Assembler::notEqual, is_interp_only);
1794     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1795     __ bind(is_interp_only);
1796 #endif
1797 
1798     __ pop(rax); // return address
1799     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
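    // On the interpreter's expression stack the last argument is on top, so
    // (with the return address already popped into rax) isVirtualThread is at
    // offset 0, isContinue at stackElementSize, and the Continuation oop at
    // 2*stackElementSize.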
1800     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1801     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1802     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1803     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1804     __ push(rax); // return address
1805     __ push_cont_fastpath();
1806 
1807     __ enter();
1808 
1809     stack_slots = 2; // will be adjusted in setup
1810     OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame
    // would appear unsafe. That's okay: at the very worst we'll miss an async sample, and we
    // are in interp_only_mode anyway.
1813 
1814     __ verify_oop(reg_cont_obj);
1815 
1816     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1817 
1818     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1819     __ testptr(reg_is_cont, reg_is_cont);
1820     __ jcc(Assembler::notZero, L_thaw);
1821 
1822     // --- Resolve path
1823 
1824     // Make sure the call is patchable
1825     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1826     // Emit stub for static call
1827     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1828     if (stub == nullptr) {
1829       fatal("CodeCache is full at gen_continuation_enter");
1830     }
1831     __ call(resolve);
1832     oop_maps->add_gc_map(__ pc() - start, map);
1833     __ post_call_nop();
1834 
1835     __ jmp(L_exit);
1836   }
1837 
1838   // compiled entry
1839   __ align(CodeEntryAlignment);
1840   compiled_entry_offset = __ pc() - start;
1841   __ enter();
1842 
1843   stack_slots = 2; // will be adjusted in setup
1844   OopMap* map = continuation_enter_setup(masm, stack_slots);
1845 
1846   // Frame is now completed as far as size and linkage.
1847   frame_complete = __ pc() - start;
1848 
1849   __ verify_oop(reg_cont_obj);
1850 
1851   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1852 
1853   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1854   __ testptr(reg_is_cont, reg_is_cont);
1855   __ jccb(Assembler::notZero, L_thaw);
1856 
1857   // --- call Continuation.enter(Continuation c, boolean isContinue)
1858 
1859   // Make sure the call is patchable
1860   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1861 
1862   // Emit stub for static call
1863   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1864   if (stub == nullptr) {
1865     fatal("CodeCache is full at gen_continuation_enter");
1866   }
1867 
1868   // The call needs to be resolved. There's a special case for this in
1869   // SharedRuntime::find_callee_info_helper() which calls
1870   // LinkResolver::resolve_continuation_enter() which resolves the call to
1871   // Continuation.enter(Continuation c, boolean isContinue).
1872   __ call(resolve);
1873 
1874   oop_maps->add_gc_map(__ pc() - start, map);
1875   __ post_call_nop();
1876 
1877   __ jmpb(L_exit);
1878 
1879   // --- Thawing path
1880 
1881   __ bind(L_thaw);
1882 
1883   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1884 
1885   ContinuationEntry::_return_pc_offset = __ pc() - start;
1886   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1887   __ post_call_nop();
1888 
1889   // --- Normal exit (resolve/thawing)
1890 
1891   __ bind(L_exit);
1892 
1893   continuation_enter_cleanup(masm);
1894   __ pop(rbp);
1895   __ ret(0);
1896 
1897   // --- Exception handling path
1898 
1899   exception_offset = __ pc() - start;
1900 
1901   continuation_enter_cleanup(masm);
1902   __ pop(rbp);
1903 
1904   __ movptr(c_rarg0, r15_thread);
1905   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1906 
1907   // rax still holds the original exception oop, save it before the call
1908   __ push(rax);
1909 
1910   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1911   __ movptr(rbx, rax);
1912 
1913   // Continue at exception handler:
1914   //   rax: exception oop
1915   //   rbx: exception handler
1916   //   rdx: exception pc
1917   __ pop(rax);
1918   __ verify_oop(rax);
1919   __ pop(rdx);
1920   __ jmp(rbx);
1921 }
1922 
1923 static void gen_continuation_yield(MacroAssembler* masm,
1924                                    const VMRegPair* regs,
1925                                    OopMapSet* oop_maps,
1926                                    int& frame_complete,
1927                                    int& stack_slots,
1928                                    int& compiled_entry_offset) {
1929   enum layout {
1930     rbp_off,
1931     rbpH_off,
1932     return_off,
1933     return_off2,
1934     framesize // inclusive of return address
1935   };
1936   stack_slots = framesize /  VMRegImpl::slots_per_word;
1937   assert(stack_slots == 2, "recheck layout");
1938 
1939   address start = __ pc();
1940   compiled_entry_offset = __ pc() - start;
1941   __ enter();
1942   address the_pc = __ pc();
1943 
1944   frame_complete = the_pc - start;
1945 
1946   // This nop must be exactly at the PC we push into the frame info.
1947   // We use this nop for fast CodeBlob lookup, associate the OopMap
1948   // with it right away.
1949   __ post_call_nop();
1950   OopMap* map = new OopMap(framesize, 1);
1951   oop_maps->add_gc_map(frame_complete, map);
1952 
1953   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1954   __ movptr(c_rarg0, r15_thread);
1955   __ movptr(c_rarg1, rsp);
1956   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1957   __ reset_last_Java_frame(true);
1958 
1959   Label L_pinned;
1960 
1961   __ testptr(rax, rax);
1962   __ jcc(Assembler::notZero, L_pinned);
1963 
1964   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1965   continuation_enter_cleanup(masm);
1966   __ pop(rbp);
1967   __ ret(0);
1968 
1969   __ bind(L_pinned);
1970 
1971   // Pinned, return to caller
1972 
1973   // handle pending exception thrown by freeze
1974   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1975   Label ok;
1976   __ jcc(Assembler::equal, ok);
1977   __ leave();
1978   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1979   __ bind(ok);
1980 
1981   __ leave();
1982   __ ret(0);
1983 }
1984 
1985 static void gen_special_dispatch(MacroAssembler* masm,
1986                                  const methodHandle& method,
1987                                  const BasicType* sig_bt,
1988                                  const VMRegPair* regs) {
1989   verify_oop_args(masm, method, sig_bt, regs);
1990   vmIntrinsics::ID iid = method->intrinsic_id();
1991 
1992   // Now write the args into the outgoing interpreter space
1993   bool     has_receiver   = false;
1994   Register receiver_reg   = noreg;
1995   int      member_arg_pos = -1;
1996   Register member_reg     = noreg;
1997   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1998   if (ref_kind != 0) {
1999     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
2000     member_reg = rbx;  // known to be free at this point
2001     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
2002   } else if (iid == vmIntrinsics::_invokeBasic) {
2003     has_receiver = true;
2004   } else if (iid == vmIntrinsics::_linkToNative) {
2005     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
2006     member_reg = rbx;  // known to be free at this point
2007   } else {
2008     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
2009   }
2010 
2011   if (member_reg != noreg) {
2012     // Load the member_arg into register, if necessary.
2013     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
2014     VMReg r = regs[member_arg_pos].first();
2015     if (r->is_stack()) {
2016       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
2017     } else {
2018       // no data motion is needed
2019       member_reg = r->as_Register();
2020     }
2021   }
2022 
2023   if (has_receiver) {
2024     // Make sure the receiver is loaded into a register.
2025     assert(method->size_of_parameters() > 0, "oob");
2026     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
2027     VMReg r = regs[0].first();
2028     assert(r->is_valid(), "bad receiver arg");
2029     if (r->is_stack()) {
2030       // Porting note:  This assumes that compiled calling conventions always
2031       // pass the receiver oop in a register.  If this is not true on some
2032       // platform, pick a temp and load the receiver from stack.
2033       fatal("receiver always in a register");
2034       receiver_reg = j_rarg0;  // known to be free at this point
2035       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
2036     } else {
2037       // no data motion is needed
2038       receiver_reg = r->as_Register();
2039     }
2040   }
2041 
2042   // Figure out which address we are really jumping to:
2043   MethodHandles::generate_method_handle_dispatch(masm, iid,
2044                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
2045 }
2046 
2047 // ---------------------------------------------------------------------------
2048 // Generate a native wrapper for a given method.  The method takes arguments
2049 // in the Java compiled code convention, marshals them to the native
2050 // convention (handlizes oops, etc), transitions to native, makes the call,
2051 // returns to java state (possibly blocking), unhandlizes any result and
2052 // returns.
2053 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
2062 //
2063 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
2064                                                 const methodHandle& method,
2065                                                 int compile_id,
2066                                                 BasicType* in_sig_bt,
2067                                                 VMRegPair* in_regs,
2068                                                 BasicType ret_type) {
2069   if (method->is_continuation_native_intrinsic()) {
2070     int exception_offset = -1;
2071     OopMapSet* oop_maps = new OopMapSet();
2072     int frame_complete = -1;
2073     int stack_slots = -1;
2074     int interpreted_entry_offset = -1;
2075     int vep_offset = -1;
2076     if (method->is_continuation_enter_intrinsic()) {
2077       gen_continuation_enter(masm,
2078                              in_regs,
2079                              exception_offset,
2080                              oop_maps,
2081                              frame_complete,
2082                              stack_slots,
2083                              interpreted_entry_offset,
2084                              vep_offset);
2085     } else if (method->is_continuation_yield_intrinsic()) {
2086       gen_continuation_yield(masm,
2087                              in_regs,
2088                              oop_maps,
2089                              frame_complete,
2090                              stack_slots,
2091                              vep_offset);
2092     } else {
2093       guarantee(false, "Unknown Continuation native intrinsic");
2094     }
2095 
2096 #ifdef ASSERT
2097     if (method->is_continuation_enter_intrinsic()) {
2098       assert(interpreted_entry_offset != -1, "Must be set");
2099       assert(exception_offset != -1,         "Must be set");
2100     } else {
2101       assert(interpreted_entry_offset == -1, "Must be unset");
2102       assert(exception_offset == -1,         "Must be unset");
2103     }
2104     assert(frame_complete != -1,    "Must be set");
2105     assert(stack_slots != -1,       "Must be set");
2106     assert(vep_offset != -1,        "Must be set");
2107 #endif
2108 
2109     __ flush();
2110     nmethod* nm = nmethod::new_native_nmethod(method,
2111                                               compile_id,
2112                                               masm->code(),
2113                                               vep_offset,
2114                                               frame_complete,
2115                                               stack_slots,
2116                                               in_ByteSize(-1),
2117                                               in_ByteSize(-1),
2118                                               oop_maps,
2119                                               exception_offset);
2120     if (nm == nullptr) return nm;
2121     if (method->is_continuation_enter_intrinsic()) {
2122       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2123     } else if (method->is_continuation_yield_intrinsic()) {
2124       _cont_doYield_stub = nm;
2125     }
2126     return nm;
2127   }
2128 
2129   if (method->is_method_handle_intrinsic()) {
2130     vmIntrinsics::ID iid = method->intrinsic_id();
2131     intptr_t start = (intptr_t)__ pc();
2132     int vep_offset = ((intptr_t)__ pc()) - start;
2133     gen_special_dispatch(masm,
2134                          method,
2135                          in_sig_bt,
2136                          in_regs);
2137     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2138     __ flush();
2139     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2140     return nmethod::new_native_nmethod(method,
2141                                        compile_id,
2142                                        masm->code(),
2143                                        vep_offset,
2144                                        frame_complete,
2145                                        stack_slots / VMRegImpl::slots_per_word,
2146                                        in_ByteSize(-1),
2147                                        in_ByteSize(-1),
2148                                        nullptr);
2149   }
2150   address native_func = method->native_function();
2151   assert(native_func != nullptr, "must have function");
2152 
2153   // An OopMap for lock (and class if static)
2154   OopMapSet *oop_maps = new OopMapSet();
2155   intptr_t start = (intptr_t)__ pc();
2156 
  // We have received a description of where all the Java args are located
  // on entry to the wrapper. We need to convert these args to where
  // the JNI function will expect them. To figure out where they go
  // we convert the Java signature to a C signature by inserting
  // the hidden arguments as arg[0] and possibly arg[1] (static method).
2162 
2163   const int total_in_args = method->size_of_parameters();
2164   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2165 
2166   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2167   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2168   BasicType* in_elem_bt = nullptr;
2169 
2170   int argc = 0;
2171   out_sig_bt[argc++] = T_ADDRESS;
2172   if (method->is_static()) {
2173     out_sig_bt[argc++] = T_OBJECT;
2174   }
2175 
2176   for (int i = 0; i < total_in_args ; i++ ) {
2177     out_sig_bt[argc++] = in_sig_bt[i];
2178   }
2179 
2180   // Now figure out where the args must be stored and how much stack space
2181   // they require.
2182   int out_arg_slots;
2183   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2184 
2185   // Compute framesize for the wrapper.  We need to handlize all oops in
2186   // incoming registers
2187 
2188   // Calculate the total number of stack slots we will need.
2189 
2190   // First count the abi requirement plus all of the outgoing args
2191   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2192 
2193   // Now the space for the inbound oop handle area
2194   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
2195 
2196   int oop_handle_offset = stack_slots;
2197   stack_slots += total_save_slots;
2198 
2199   // Now any space we need for handlizing a klass if static method
2200 
2201   int klass_slot_offset = 0;
2202   int klass_offset = -1;
2203   int lock_slot_offset = 0;
2204   bool is_static = false;
2205 
2206   if (method->is_static()) {
2207     klass_slot_offset = stack_slots;
2208     stack_slots += VMRegImpl::slots_per_word;
2209     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2210     is_static = true;
2211   }
2212 
2213   // Plus a lock if needed
2214 
2215   if (method->is_synchronized()) {
2216     lock_slot_offset = stack_slots;
2217     stack_slots += VMRegImpl::slots_per_word;
2218   }
2219 
  // Now a place (+2 slots) to save return values or a temp during shuffling,
  // + 4 slots for the return address (which we own) and the saved rbp
2222   stack_slots += 6;
2223 
2224   // Ok The space we have allocated will look like:
2225   //
2226   //
2227   // FP-> |                     |
2228   //      |---------------------|
2229   //      | 2 slots for moves   |
2230   //      |---------------------|
2231   //      | lock box (if sync)  |
2232   //      |---------------------| <- lock_slot_offset
2233   //      | klass (if static)   |
2234   //      |---------------------| <- klass_slot_offset
2235   //      | oopHandle area      |
2236   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2237   //      | outbound memory     |
2238   //      | based arguments     |
2239   //      |                     |
2240   //      |---------------------|
2241   //      |                     |
2242   // SP-> | out_preserved_slots |
2243   //
2244   //
2245 
2246 
2247   // Now compute actual number of stack words we need rounding to make
2248   // stack properly aligned.
2249   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
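  // E.g. with StackAlignmentInBytes == 16, StackAlignmentInSlots is 4, so the
  // slot count is rounded up to keep the frame 16-byte aligned.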
2250 
2251   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2252 
2253   // First thing make an ic check to see if we should even be here
2254 
2255   // We are free to use all registers as temps without saving them and
2256   // restoring them except rbp. rbp is the only callee save register
2257   // as far as the interpreter and the compiler(s) are concerned.
2258 
2259   const Register receiver = j_rarg0;
2260 
2261   Label exception_pending;
2262 
2263   assert_different_registers(receiver, rscratch1, rscratch2);
2264   __ verify_oop(receiver);
2265   __ ic_check(8 /* end_alignment */);
2266 
2267   int vep_offset = ((intptr_t)__ pc()) - start;
2268 
2269   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2270     Label L_skip_barrier;
2271     Register klass = r10;
2272     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2273     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2274 
2275     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2276 
2277     __ bind(L_skip_barrier);
2278   }
2279 
2280 #ifdef COMPILER1
  // For Object.hashCode and System.identityHashCode, try to pull the hashCode from the object header if available.
2282   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2283     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2284   }
2285 #endif // COMPILER1
2286 
2287   // The instruction at the verified entry point must be 5 bytes or longer
  // because it can be patched on the fly by make_not_entrant. The stack bang
2289   // instruction fits that requirement.
2290 
2291   // Generate stack overflow check
2292   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2293 
2294   // Generate a new frame for the wrapper.
2295   __ enter();
2296   // -2 because return address is already present and so is saved rbp
2297   __ subptr(rsp, stack_size - 2*wordSize);
2298 
2299   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2300   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2301   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2302 
2303   // Frame is now completed as far as size and linkage.
2304   int frame_complete = ((intptr_t)__ pc()) - start;
2305 
2306 #ifdef ASSERT
2307   __ check_stack_alignment(rsp, "improperly aligned stack");
2308 #endif /* ASSERT */
2309 
2310 
2311   // We use r14 as the oop handle for the receiver/klass
2312   // It is callee save so it survives the call to native
2313 
2314   const Register oop_handle_reg = r14;
2315 
2316   //
2317   // We immediately shuffle the arguments so that any vm call we have to
2318   // make from here on out (sync slow path, jvmti, etc.) we will have
2319   // captured the oops from our caller and have a valid oopMap for
2320   // them.
2321 
2322   // -----------------
2323   // The Grand Shuffle
2324 
  // The Java calling convention is either equal (linux) or denser (win64) than the
  // C calling convention. However, because of the jni_env argument, the C calling
  // convention always has at least one more (and two for static) arguments than Java.
  // Therefore, if we move the args from java -> c backwards then we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.
2331   //
2332 
2333   // Record esp-based slot for receiver on stack for non-static methods
2334   int receiver_offset = -1;
2335 
  // This is a trick. We double the stack slots so we can claim
  // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
  // sure we can capture all the incoming oop args from the
  // caller.
2341   //
2342   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2343 
2344   // Mark location of rbp (someday)
2345   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2346 
2347   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2348   // All inbound args are referenced based on rbp and all outbound args via rsp.
2349 
2350 
2351 #ifdef ASSERT
2352   bool reg_destroyed[Register::number_of_registers];
2353   bool freg_destroyed[XMMRegister::number_of_registers];
2354   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2355     reg_destroyed[r] = false;
2356   }
2357   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2358     freg_destroyed[f] = false;
2359   }
2360 
2361 #endif /* ASSERT */
2362 
2363   // For JNI natives the incoming and outgoing registers are offset upwards.
2364   GrowableArray<int> arg_order(2 * total_in_args);
2365 
2366   VMRegPair tmp_vmreg;
2367   tmp_vmreg.set2(rbx->as_VMReg());
2368 
2369   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2370     arg_order.push(i);
2371     arg_order.push(c_arg);
2372   }
2373 
2374   int temploc = -1;
2375   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2376     int i = arg_order.at(ai);
2377     int c_arg = arg_order.at(ai + 1);
2378     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2379 #ifdef ASSERT
2380     if (in_regs[i].first()->is_Register()) {
2381       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2382     } else if (in_regs[i].first()->is_XMMRegister()) {
2383       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2384     }
2385     if (out_regs[c_arg].first()->is_Register()) {
2386       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2387     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2388       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2389     }
2390 #endif /* ASSERT */
2391     switch (in_sig_bt[i]) {
2392       case T_ARRAY:
2393       case T_OBJECT:
2394         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2395                     ((i == 0) && (!is_static)),
2396                     &receiver_offset);
2397         break;
2398       case T_VOID:
2399         break;
2400 
2401       case T_FLOAT:
2402         __ float_move(in_regs[i], out_regs[c_arg]);
        break;
2404 
2405       case T_DOUBLE:
2406         assert( i + 1 < total_in_args &&
2407                 in_sig_bt[i + 1] == T_VOID &&
2408                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2409         __ double_move(in_regs[i], out_regs[c_arg]);
2410         break;
2411 
2412       case T_LONG :
2413         __ long_move(in_regs[i], out_regs[c_arg]);
2414         break;
2415 
2416       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2417 
2418       default:
2419         __ move32_64(in_regs[i], out_regs[c_arg]);
2420     }
2421   }
2422 
2423   int c_arg;
2424 
2425   // Pre-load a static method's oop into r14.  Used both by locking code and
2426   // the normal JNI call code.
2427   // point c_arg at the first arg that is already loaded in case we
2428   // need to spill before we call out
2429   c_arg = total_c_args - total_in_args;
2430 
2431   if (method->is_static()) {
2432 
2433     //  load oop into a register
2434     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2435 
    // Now handlize the static class mirror; it's known to be not-null.
2437     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2438     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2439 
2440     // Now get the handle
2441     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2442     // store the klass handle as second argument
2443     __ movptr(c_rarg1, oop_handle_reg);
2444     // and protect the arg if we must spill
2445     c_arg--;
2446   }
2447 
2448   // Change state to native (we save the return address in the thread, since it might not
2449   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2450   // points into the right code segment. It does not have to be the correct return pc.
2451   // We use the same pc/oopMap repeatedly when we call out
2452 
2453   intptr_t the_pc = (intptr_t) __ pc();
2454   oop_maps->add_gc_map(the_pc - start, map);
2455 
2456   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2457 
2458 
  // We have all of the arguments set up at this point. We must not touch any register
  // argument registers from here on (if we had to save/restore them there would be no
  // oop map covering them).
2461 
2462   if (DTraceMethodProbes) {
2463     // protect the args we've loaded
2464     save_args(masm, total_c_args, c_arg, out_regs);
2465     __ mov_metadata(c_rarg1, method());
2466     __ call_VM_leaf(
2467       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2468       r15_thread, c_rarg1);
2469     restore_args(masm, total_c_args, c_arg, out_regs);
2470   }
2471 
2472   // RedefineClasses() tracing support for obsolete method entry
2473   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2474     // protect the args we've loaded
2475     save_args(masm, total_c_args, c_arg, out_regs);
2476     __ mov_metadata(c_rarg1, method());
2477     __ call_VM_leaf(
2478       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2479       r15_thread, c_rarg1);
2480     restore_args(masm, total_c_args, c_arg, out_regs);
2481   }
2482 
2483   // Lock a synchronized method
2484 
2485   // Register definitions used by locking and unlocking
2486 
2487   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2488   const Register obj_reg  = rbx;  // Will contain the oop
2489   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2490   const Register old_hdr  = r13;  // value of old header at unlock time
2491 
2492   Label slow_path_lock;
2493   Label lock_done;
2494 
2495   if (method->is_synchronized()) {
2496     Label count_mon;
2497 
2498     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2499 
2500     // Get the handle (the 2nd argument)
2501     __ mov(oop_handle_reg, c_rarg1);
2502 
2503     // Get address of the box
2504 
2505     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2506 
2507     // Load the oop from the handle
2508     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2509 
2510     if (LockingMode == LM_MONITOR) {
2511       __ jmp(slow_path_lock);
2512     } else if (LockingMode == LM_LEGACY) {
2513       // Load immediate 1 into swap_reg %rax
2514       __ movl(swap_reg, 1);
2515 
2516       // Load (object->mark() | 1) into swap_reg %rax
2517       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2518       if (EnableValhalla) {
2519         // Mask inline_type bit such that we go to the slow path if object is an inline type
2520         __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place));
2521       }
2522 
2523       // Save (object->mark() | 1) into BasicLock's displaced header
2524       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2525 
2526       // src -> dest iff dest == rax else rax <- dest
2527       __ lock();
2528       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2529       __ jcc(Assembler::equal, count_mon);
2530 
2531       // Hmm should this move to the slow path code area???
2532 
2533       // Test if the oopMark is an obvious stack pointer, i.e.,
2534       //  1) (mark & 3) == 0, and
      //  2) rsp <= mark < rsp + os::pagesize()
2536       // These 3 tests can be done by evaluating the following
2537       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2538       // assuming both stack pointer and pagesize have their
2539       // least significant 2 bits clear.
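      // For example (assuming a 4K page): 3 - 4096 == 0x...f003, so the AND
      // result is zero exactly when (mark - rsp) is 4-byte aligned, non-negative
      // and less than one page, i.e. the recursive stack-lock case handled below.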
2540       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2541 
2542       __ subptr(swap_reg, rsp);
2543       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2544 
2545       // Save the test result, for recursive case, the result is zero
2546       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2547       __ jcc(Assembler::notEqual, slow_path_lock);
2548     } else {
2549       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2550       __ lightweight_lock(lock_reg, obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2551     }
2552     __ bind(count_mon);
2553     __ inc_held_monitor_count();
2554 
2555     // Slow path will re-enter here
2556     __ bind(lock_done);
2557   }
2558 
2559   // Finally just about ready to make the JNI call
2560 
2561   // get JNIEnv* which is first argument to native
2562   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2563 
2564   // Now set thread in native
2565   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2566 
2567   __ call(RuntimeAddress(native_func));
2568 
2569   // Verify or restore cpu control state after JNI call
2570   __ restore_cpu_control_state_after_jni(rscratch1);
2571 
2572   // Unpack native results.
2573   switch (ret_type) {
2574   case T_BOOLEAN: __ c2bool(rax);            break;
2575   case T_CHAR   : __ movzwl(rax, rax);      break;
2576   case T_BYTE   : __ sign_extend_byte (rax); break;
2577   case T_SHORT  : __ sign_extend_short(rax); break;
2578   case T_INT    : /* nothing to do */        break;
2579   case T_DOUBLE :
2580   case T_FLOAT  :
2581     // Result is in xmm0 we'll save as needed
2582     break;
2583   case T_ARRAY:                 // Really a handle
2584   case T_OBJECT:                // Really a handle
2585       break; // can't de-handlize until after safepoint check
2586   case T_VOID: break;
2587   case T_LONG: break;
2588   default       : ShouldNotReachHere();
2589   }
2590 
2591   Label after_transition;
2592 
2593   // Switch thread to "native transition" state before reading the synchronization state.
2594   // This additional state is necessary because reading and testing the synchronization
2595   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2596   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2597   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2598   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
2600   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2601 
2602   // Force this write out before the read below
2603   if (!UseSystemMemoryBarrier) {
2604     __ membar(Assembler::Membar_mask_bits(
2605               Assembler::LoadLoad | Assembler::LoadStore |
2606               Assembler::StoreLoad | Assembler::StoreStore));
2607   }
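       // (With UseSystemMemoryBarrier the fence is elided here; roughly
       // speaking, the VM then relies on a system-wide memory barrier being
       // issued on its behalf whenever global serialization is required.)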
2608 
2609   // check for safepoint operation in progress and/or pending suspend requests
2610   {
2611     Label Continue;
2612     Label slow_path;
2613 
2614     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2615 
2616     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2617     __ jcc(Assembler::equal, Continue);
2618     __ bind(slow_path);
2619 
2620     // Don't use call_VM as it will see a possible pending exception and forward it
2621     // and never return here, preventing us from clearing _last_native_pc down below.
2622     // We also can't use call_VM_leaf, as it will check to see if rsi & rdi are
2623     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2624     // by hand.
2625     //
2626     __ vzeroupper();
2627     save_native_result(masm, ret_type, stack_slots);
2628     __ mov(c_rarg0, r15_thread);
2629     __ mov(r12, rsp); // remember sp
2630     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2631     __ andptr(rsp, -16); // align stack as required by ABI
2632     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2633     __ mov(rsp, r12); // restore sp
2634     __ reinit_heapbase();
2635     // Restore any method result value
2636     restore_native_result(masm, ret_type, stack_slots);
2637     __ bind(Continue);
2638   }
2639 
2640   // change thread state
2641   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2642   __ bind(after_transition);
2643 
2644   Label reguard;
2645   Label reguard_done;
2646   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2647   __ jcc(Assembler::equal, reguard);
2648   __ bind(reguard_done);
2649 
2650   // The native result, if any, is live at this point.
2651 
2652   // Unlock
2653   Label slow_path_unlock;
2654   Label unlock_done;
2655   if (method->is_synchronized()) {
2656 
2657     Label fast_done;
2658 
2659     // Get locked oop from the handle we passed to jni
2660     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2661 
2662     if (LockingMode == LM_LEGACY) {
2663       Label not_recur;
2664       // Simple recursive lock?
2665       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2666       __ jcc(Assembler::notEqual, not_recur);
2667       __ dec_held_monitor_count();
2668       __ jmpb(fast_done);
2669       __ bind(not_recur);
2670     }
2671 
2672     // Must save rax if it is live now because cmpxchg must use it
2673     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2674       save_native_result(masm, ret_type, stack_slots);
2675     }
2676 
2677     if (LockingMode == LM_MONITOR) {
2678       __ jmp(slow_path_unlock);
2679     } else if (LockingMode == LM_LEGACY) {
2680       // get address of the stack lock
2681       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2682       //  get old displaced header
2683       __ movptr(old_hdr, Address(rax, 0));
2684 
2685       // Atomic swap old header if oop still contains the stack lock
2686       __ lock();
2687       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2688       __ jcc(Assembler::notEqual, slow_path_unlock);
2689       __ dec_held_monitor_count();
2690     } else {
2691       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2692       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2693       __ dec_held_monitor_count();
2694     }
2695 
2696     // slow path re-enters here
2697     __ bind(unlock_done);
2698     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2699       restore_native_result(masm, ret_type, stack_slots);
2700     }
2701 
2702     __ bind(fast_done);
2703   }
2704   if (DTraceMethodProbes) {
2705     save_native_result(masm, ret_type, stack_slots);
2706     __ mov_metadata(c_rarg1, method());
2707     __ call_VM_leaf(
2708          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2709          r15_thread, c_rarg1);
2710     restore_native_result(masm, ret_type, stack_slots);
2711   }
2712 
2713   __ reset_last_Java_frame(false);
2714 
2715   // Unbox oop result, e.g. JNIHandles::resolve value.
2716   if (is_reference_type(ret_type)) {
2717     __ resolve_jobject(rax /* value */,
2718                        r15_thread /* thread */,
2719                        rcx /* tmp */);
2720   }
2721 
2722   if (CheckJNICalls) {
2723     // clear_pending_jni_exception_check
2724     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2725   }
2726 
2727   // reset handle block
2728   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2729   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2730 
2731   // pop our frame
2732 
2733   __ leave();
2734 
2735   // Any exception pending?
2736   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2737   __ jcc(Assembler::notEqual, exception_pending);
2738 
2739   // Return
2740 
2741   __ ret(0);
2742 
2743   // Unexpected paths are out of line and go here
2744 
2745   // forward the exception
2746   __ bind(exception_pending);
2747 
2749   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2750 
2751   // Slow path locking & unlocking
2752   if (method->is_synchronized()) {
2753 
2754     // BEGIN Slow path lock
2755     __ bind(slow_path_lock);
2756 
2757     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2758     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2759 
2760     // protect the args we've loaded
2761     save_args(masm, total_c_args, c_arg, out_regs);
2762 
2763     __ mov(c_rarg0, obj_reg);
2764     __ mov(c_rarg1, lock_reg);
2765     __ mov(c_rarg2, r15_thread);
2766 
2767     // Not a leaf but we have last_Java_frame setup as we want
2768     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2769     restore_args(masm, total_c_args, c_arg, out_regs);
2770 
2771 #ifdef ASSERT
2772     { Label L;
2773     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2774     __ jcc(Assembler::equal, L);
2775     __ stop("no pending exception allowed on exit from monitorenter");
2776     __ bind(L);
2777     }
2778 #endif
2779     __ jmp(lock_done);
2780 
2781     // END Slow path lock
2782 
2783     // BEGIN Slow path unlock
2784     __ bind(slow_path_unlock);
2785 
2786     // If we haven't already saved the native result we must save it now as xmm registers
2787     // are still exposed.
2788     __ vzeroupper();
2789     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2790       save_native_result(masm, ret_type, stack_slots);
2791     }
2792 
2793     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2794 
2795     __ mov(c_rarg0, obj_reg);
2796     __ mov(c_rarg2, r15_thread);
2797     __ mov(r12, rsp); // remember sp
2798     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2799     __ andptr(rsp, -16); // align stack as required by ABI
2800 
2801     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2802     // NOTE that obj_reg == rbx currently
2803     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2804     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2805 
2806     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2807     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2808     __ mov(rsp, r12); // restore sp
2809     __ reinit_heapbase();
2810 #ifdef ASSERT
2811     {
2812       Label L;
2813       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2814       __ jcc(Assembler::equal, L);
2815       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2816       __ bind(L);
2817     }
2818 #endif /* ASSERT */
2819 
2820     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2821 
2822     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2823       restore_native_result(masm, ret_type, stack_slots);
2824     }
2825     __ jmp(unlock_done);
2826 
2827     // END Slow path unlock
2828 
2829   } // synchronized
2830 
2831   // SLOW PATH Reguard the stack if needed
2832 
2833   __ bind(reguard);
2834   __ vzeroupper();
2835   save_native_result(masm, ret_type, stack_slots);
2836   __ mov(r12, rsp); // remember sp
2837   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2838   __ andptr(rsp, -16); // align stack as required by ABI
2839   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2840   __ mov(rsp, r12); // restore sp
2841   __ reinit_heapbase();
2842   restore_native_result(masm, ret_type, stack_slots);
2843   // and continue
2844   __ jmp(reguard_done);
2845 
2846 
2847 
2848   __ flush();
2849 
2850   nmethod *nm = nmethod::new_native_nmethod(method,
2851                                             compile_id,
2852                                             masm->code(),
2853                                             vep_offset,
2854                                             frame_complete,
2855                                             stack_slots / VMRegImpl::slots_per_word,
2856                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2857                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2858                                             oop_maps);
2859 
2860   return nm;
2861 }
2862 
2863 // This function returns the adjustment size (in number of words) to a c2i adapter
2864 // activation for use during deoptimization
2865 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2866   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2867 }
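     // For example (illustrative only, and assuming Interpreter::stackElementWords
     // is 1 on this platform): a callee with 3 parameters and 5 locals yields an
     // adjustment of (5 - 3) * 1 = 2 words.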
2868 
2869 
2870 uint SharedRuntime::out_preserve_stack_slots() {
2871   return 0;
2872 }
2873 
2874 
2875 // Number of stack slots between incoming argument block and the start of
2876 // a new frame.  The PROLOG must add this many slots to the stack.  The
2877 // EPILOG must remove this many slots.  amd64 needs two slots for
2878 // return address.
2879 uint SharedRuntime::in_preserve_stack_slots() {
2880   return 4 + 2 * VerifyStackAtCalls;
2881 }
2882 
2883 //------------------------------generate_deopt_blob----------------------------
2884 void SharedRuntime::generate_deopt_blob() {
2885   // Allocate space for the code
2886   ResourceMark rm;
2887   // Setup code generation tools
2888   int pad = 0;
2889   if (UseAVX > 2) {
2890     pad += 1024;
2891   }
2892   if (UseAPX) {
2893     pad += 1024;
2894   }
2895 #if INCLUDE_JVMCI
2896   if (EnableJVMCI) {
2897     pad += 512; // Increase the buffer size when compiling for JVMCI
2898   }
2899 #endif
2900   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2901   MacroAssembler* masm = new MacroAssembler(&buffer);
2902   int frame_size_in_words;
2903   OopMap* map = nullptr;
2904   OopMapSet *oop_maps = new OopMapSet();
2905 
2906   // -------------
2907   // This code enters when returning to a de-optimized nmethod.  A return
2908   // address has been pushed on the stack, and return values are in
2909   // registers.
2910   // If we are doing a normal deopt then we were called from the patched
2911   // nmethod at the point where we returned into it, so the return
2912   // address on the stack is wrong by NativeCall::instruction_size.
2913   // We will adjust the value so it looks like we have the original return
2914   // address on the stack (like when we eagerly deoptimized).
2915   // In the case of an exception pending when deoptimizing, we enter
2916   // with a return address on the stack that points after the call we patched
2917   // into the exception handler. We have the following register state from,
2918   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2919   //    rax: exception oop
2920   //    rbx: exception handler
2921   //    rdx: throwing pc
2922   // So in this case we simply jam rdx into the useless return address and
2923   // the stack looks just like we want.
2924   //
2925   // At this point we need to de-opt.  We save the argument return
2926   // registers.  We call the first C routine, fetch_unroll_info().  This
2927   // routine captures the return values and returns a structure which
2928   // describes the current frame size and the sizes of all replacement frames.
2929   // The current frame is compiled code and may contain many inlined
2930   // functions, each with their own JVM state.  We pop the current frame, then
2931   // push all the new frames.  Then we call the C routine unpack_frames() to
2932   // populate these frames.  Finally unpack_frames() returns us the new target
2933   // address.  Notice that callee-save registers are BLOWN here; they have
2934   // already been captured in the vframeArray at the time the return PC was
2935   // patched.
2936   address start = __ pc();
2937   Label cont;
2938 
2939   // Prolog for non exception case!
2940 
2941   // Save everything in sight.
2942   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2943 
2944   // Normal deoptimization.  Save exec mode for unpack_frames.
2945   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2946   __ jmp(cont);
2947 
2948   int reexecute_offset = __ pc() - start;
2949 #if INCLUDE_JVMCI && !defined(COMPILER1)
2950   if (EnableJVMCI && UseJVMCICompiler) {
2951     // JVMCI does not use this kind of deoptimization
2952     __ should_not_reach_here();
2953   }
2954 #endif
2955 
2956   // Reexecute case
2957   // The return address is the pc that describes which bci to re-execute at
2958 
2959   // No need to update map as each call to save_live_registers will produce identical oopmap
2960   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2961 
2962   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2963   __ jmp(cont);
2964 
2965 #if INCLUDE_JVMCI
2966   Label after_fetch_unroll_info_call;
2967   int implicit_exception_uncommon_trap_offset = 0;
2968   int uncommon_trap_offset = 0;
2969 
2970   if (EnableJVMCI) {
2971     implicit_exception_uncommon_trap_offset = __ pc() - start;
2972 
2973     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2974     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2975 
2976     uncommon_trap_offset = __ pc() - start;
2977 
2978     // Save everything in sight.
2979     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2980     // fetch_unroll_info needs to call last_java_frame()
2981     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2982 
2983     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2984     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2985 
2986     __ movl(r14, Deoptimization::Unpack_reexecute);
2987     __ mov(c_rarg0, r15_thread);
2988     __ movl(c_rarg2, r14); // exec mode
2989     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2990     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2991 
2992     __ reset_last_Java_frame(false);
2993 
2994     __ jmp(after_fetch_unroll_info_call);
2995   } // EnableJVMCI
2996 #endif // INCLUDE_JVMCI
2997 
2998   int exception_offset = __ pc() - start;
2999 
3000   // Prolog for exception case
3001 
3002   // all registers are dead at this entry point, except for rax and
3003   // rdx, which contain the exception oop and exception pc
3004   // respectively.  Set them in TLS and fall thru to the
3005   // unpack_with_exception_in_tls entry point.
3006 
3007   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3008   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3009 
3010   int exception_in_tls_offset = __ pc() - start;
3011 
3012   // new implementation because exception oop is now passed in JavaThread
3013 
3014   // Prolog for exception case
3015   // All registers must be preserved because they might be used by LinearScan
3016   // Exception oop and throwing PC are passed in JavaThread
3017   // tos: stack at point of call to method that threw the exception (i.e. only
3018   // args are on the stack, no return address)
3019 
3020   // make room on stack for the return address
3021   // It will be patched later with the throwing pc. The correct value is not
3022   // available now because loading it from memory would destroy registers.
3023   __ push(0);
3024 
3025   // Save everything in sight.
3026   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
3027 
3028   // Now it is safe to overwrite any register
3029 
3030   // Deopt during an exception.  Save exec mode for unpack_frames.
3031   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
3032 
3033   // load throwing pc from JavaThread and patch it as the return address
3034   // of the current frame. Then clear the field in JavaThread
3035 
3036   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3037   __ movptr(Address(rbp, wordSize), rdx);
3038   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3039 
3040 #ifdef ASSERT
3041   // verify that there is really an exception oop in JavaThread
3042   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3043   __ verify_oop(rax);
3044 
3045   // verify that there is no pending exception
3046   Label no_pending_exception;
3047   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3048   __ testptr(rax, rax);
3049   __ jcc(Assembler::zero, no_pending_exception);
3050   __ stop("must not have pending exception here");
3051   __ bind(no_pending_exception);
3052 #endif
3053 
3054   __ bind(cont);
3055 
3056   // Call C code.  Need thread and this frame, but NOT official VM entry
3057   // crud.  We cannot block on this call, no GC can happen.
3058   //
3059   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
3060 
3061   // fetch_unroll_info needs to call last_java_frame().
3062 
3063   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3064 #ifdef ASSERT
3065   { Label L;
3066     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3067     __ jcc(Assembler::equal, L);
3068     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
3069     __ bind(L);
3070   }
3071 #endif // ASSERT
3072   __ mov(c_rarg0, r15_thread);
3073   __ movl(c_rarg1, r14); // exec_mode
3074   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
3075 
3076   // Need to have an oopmap that tells fetch_unroll_info where to
3077   // find any register it might need.
3078   oop_maps->add_gc_map(__ pc() - start, map);
3079 
3080   __ reset_last_Java_frame(false);
3081 
3082 #if INCLUDE_JVMCI
3083   if (EnableJVMCI) {
3084     __ bind(after_fetch_unroll_info_call);
3085   }
3086 #endif
3087 
3088   // Load UnrollBlock* into rdi
3089   __ mov(rdi, rax);
3090 
3091   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
3092   Label noException;
3093   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
3094   __ jcc(Assembler::notEqual, noException);
3095   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3096   // QQQ this is useless; it was null above
3097   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3098   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3099   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3100 
3101   __ verify_oop(rax);
3102 
3103   // Overwrite the result registers with the exception results.
3104   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3105   // I think this is useless
3106   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
3107 
3108   __ bind(noException);
3109 
3110   // Only register save data is on the stack.
3111   // Now restore the result registers.  Everything else is either dead
3112   // or captured in the vframeArray.
3113   RegisterSaver::restore_result_registers(masm);
3114 
3115   // All of the register save area has been popped off the stack. Only the
3116   // return address remains.
3117 
3118   // Pop all the frames we must move/replace.
3119   //
3120   // Frame picture (youngest to oldest)
3121   // 1: self-frame (no frame link)
3122   // 2: deopting frame  (no frame link)
3123   // 3: caller of deopting frame (could be compiled/interpreted).
3124   //
3125   // Note: by leaving the return address of self-frame on the stack
3126   // and using the size of frame 2 to adjust the stack
3127   // when we are done the return to frame 3 will still be on the stack.
3128 
3129   // Pop deoptimized frame
3130   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
3131   __ addptr(rsp, rcx);
3132 
3133   // rsp should be pointing at the return address to the caller (3)
3134 
3135   // Pick up the initial fp we should save
3136   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3137   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3138 
3139 #ifdef ASSERT
3140   // Compilers generate code that bangs the stack by as much as the
3141   // interpreter would need, so this stack banging should never
3142   // trigger a fault. Verify that it does not on non-product builds.
3143   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3144   __ bang_stack_size(rbx, rcx);
3145 #endif
3146 
3147   // Load address of array of frame pcs into rcx
3148   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3149 
3150   // Trash the old pc
3151   __ addptr(rsp, wordSize);
3152 
3153   // Load address of array of frame sizes into rsi
3154   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3155 
3156   // Load counter into rdx
3157   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
3158 
3159   // Now adjust the caller's stack to make up for the extra locals,
3160   // but record the original sp so that we can save it in the skeletal interpreter
3161   // frame; the stack walking of interpreter_sender will then get the unextended sp
3162   // value and not the "real" sp value.
3163 
3164   const Register sender_sp = r8;
3165 
3166   __ mov(sender_sp, rsp);
3167   __ movl(rbx, Address(rdi,
3168                        Deoptimization::UnrollBlock::
3169                        caller_adjustment_offset()));
3170   __ subptr(rsp, rbx);
3171 
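       // A rough C-like sketch of the frame-pushing loop below (illustrative only):
       //
       //   for (int k = 0; k < number_of_frames; k++) {
       //     push(frame_pcs[k]);                      // return address for frame k
       //     push(rbp); rbp = rsp;                    // enter()
       //     rsp -= frame_sizes[k] - 2 * wordSize;    // rest of the skeletal frame
       //     fp[interpreter_frame_last_sp_offset]   = nullptr;   // fixed up later
       //     fp[interpreter_frame_sender_sp_offset] = sender_sp; // make it walkable
       //     sender_sp = rsp;
       //   }
       //   push(frame_pcs[number_of_frames]);         // return address of self-frame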
3172   // Push interpreter frames in a loop
3173   Label loop;
3174   __ bind(loop);
3175   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3176   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3177   __ pushptr(Address(rcx, 0));          // Save return address
3178   __ enter();                           // Save old & set new ebp
3179   __ subptr(rsp, rbx);                  // Prolog
3180   // This value is corrected by layout_activation_impl
3181   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3182   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3183   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3184   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3185   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3186   __ decrementl(rdx);                   // Decrement counter
3187   __ jcc(Assembler::notZero, loop);
3188   __ pushptr(Address(rcx, 0));          // Save final return address
3189 
3190   // Re-push self-frame
3191   __ enter();                           // Save old & set new ebp
3192 
3193   // Allocate a full sized register save area.
3194   // Return address and rbp are in place, so we allocate two less words.
3195   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3196 
3197   // Restore frame locals after moving the frame
3198   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3199   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3200 
3201   // Call C code.  Need thread but NOT official VM entry
3202   // crud.  We cannot block on this call, no GC can happen.  Call should
3203   // restore return values to their stack-slots with the new SP.
3204   //
3205   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3206 
3207   // Use rbp because the frames look interpreted now
3208   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3209   // Don't need the precise return PC here, just precise enough to point into this code blob.
3210   address the_pc = __ pc();
3211   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3212 
3213   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3214   __ mov(c_rarg0, r15_thread);
3215   __ movl(c_rarg1, r14); // second arg: exec_mode
3216   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3217   // Revert SP alignment after call since we're going to do some SP relative addressing below
3218   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3219 
3220   // Set an oopmap for the call site
3221   // Use the same PC we used for the last java frame
3222   oop_maps->add_gc_map(the_pc - start,
3223                        new OopMap( frame_size_in_words, 0 ));
3224 
3225   // Clear fp AND pc
3226   __ reset_last_Java_frame(true);
3227 
3228   // Collect return values
3229   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3230   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3231   // I think this is useless (throwing pc?)
3232   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3233 
3234   // Pop self-frame.
3235   __ leave();                           // Epilog
3236 
3237   // Jump to interpreter
3238   __ ret(0);
3239 
3240   // Make sure all code is generated
3241   masm->flush();
3242 
3243   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3244   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3245 #if INCLUDE_JVMCI
3246   if (EnableJVMCI) {
3247     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3248     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3249   }
3250 #endif
3251 }
3252 
3253 //------------------------------generate_handler_blob------
3254 //
3255 // Generate a special Compile2Runtime blob that saves all registers
3256 // and sets up the oopmap.
3257 //
3258 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3259   assert(StubRoutines::forward_exception_entry() != nullptr,
3260          "must be generated before");
3261 
3262   ResourceMark rm;
3263   OopMapSet *oop_maps = new OopMapSet();
3264   OopMap* map;
3265 
3266   // Allocate space for the code.  Setup code generation tools.
3267   CodeBuffer buffer("handler_blob", 2348, 1024);
3268   MacroAssembler* masm = new MacroAssembler(&buffer);
3269 
3270   address start   = __ pc();
3271   address call_pc = nullptr;
3272   int frame_size_in_words;
3273   bool cause_return = (poll_type == POLL_AT_RETURN);
3274   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3275 
3276   // Make room for return address (or push it again)
3277   if (!cause_return) {
3278     __ push(rbx);
3279   }
3280 
3281   // Save registers, fpu state, and flags
3282   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3283 
3284   // The following is basically a call_VM.  However, we need the precise
3285   // address of the call in order to generate an oopmap. Hence, we do all the
3286   // work ourselves.
3287 
3288   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3289 
3290   // The return address must always be correct so that the frame constructor
3291   // never sees an invalid pc.
3292 
3293   if (!cause_return) {
3294     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3295     // Additionally, rbx is a callee saved register and we can look at it later to determine
3296     // if someone changed the return address for us!
3297     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3298     __ movptr(Address(rbp, wordSize), rbx);
3299   }
3300 
3301   // Do the call
3302   __ mov(c_rarg0, r15_thread);
3303   __ call(RuntimeAddress(call_ptr));
3304 
3305   // Set an oopmap for the call site.  This oopmap will map all
3306   // oop-registers and debug-info registers as callee-saved.  This
3307   // will allow deoptimization at this safepoint to find all possible
3308   // debug-info recordings, as well as let GC find all oops.
3309 
3310   oop_maps->add_gc_map( __ pc() - start, map);
3311 
3312   Label noException;
3313 
3314   __ reset_last_Java_frame(false);
3315 
3316   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3317   __ jcc(Assembler::equal, noException);
3318 
3319   // Exception pending
3320 
3321   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3322 
3323   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3324 
3325   // No exception case
3326   __ bind(noException);
3327 
3328   Label no_adjust;
3329 #ifdef ASSERT
3330   Label bail;
3331 #endif
3332   if (!cause_return) {
3333     Label no_prefix, not_special;
3334 
3335     // If our stashed return pc was modified by the runtime we avoid touching it
3336     __ cmpptr(rbx, Address(rbp, wordSize));
3337     __ jccb(Assembler::notEqual, no_adjust);
3338 
3339     // Skip over the poll instruction.
3340     // See NativeInstruction::is_safepoint_poll()
3341     // Possible encodings:
3342     //      85 00       test   %eax,(%rax)
3343     //      85 01       test   %eax,(%rcx)
3344     //      85 02       test   %eax,(%rdx)
3345     //      85 03       test   %eax,(%rbx)
3346     //      85 06       test   %eax,(%rsi)
3347     //      85 07       test   %eax,(%rdi)
3348     //
3349     //   41 85 00       test   %eax,(%r8)
3350     //   41 85 01       test   %eax,(%r9)
3351     //   41 85 02       test   %eax,(%r10)
3352     //   41 85 03       test   %eax,(%r11)
3353     //   41 85 06       test   %eax,(%r14)
3354     //   41 85 07       test   %eax,(%r15)
3355     //
3356     //      85 04 24    test   %eax,(%rsp)
3357     //   41 85 04 24    test   %eax,(%r12)
3358     //      85 45 00    test   %eax,0x0(%rbp)
3359     //   41 85 45 00    test   %eax,0x0(%r13)
3360 
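         // In effect (illustrative sketch only) the code below computes the
         // poll instruction's length and steps the return pc over it:
         //
         //   int len = 2;                                   // test %eax,(reg)
         //   if (poll[0] == 0x41) { poll++; len++; }        // REX.B prefix
         //   if ((poll[1] & 0x07) == 0x04 ||
         //       (poll[1] & 0x07) == 0x05) len++;           // SIB byte / disp8 form
         //   return_pc = poll_start + len;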
3361     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3362     __ jcc(Assembler::notEqual, no_prefix);
3363     __ addptr(rbx, 1);
3364     __ bind(no_prefix);
3365 #ifdef ASSERT
3366     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3367 #endif
3368     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3369     // r12/rsp 0x04
3370     // r13/rbp 0x05
3371     __ movzbq(rcx, Address(rbx, 1));
3372     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3373     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3374     __ cmpptr(rcx, 1);
3375     __ jcc(Assembler::above, not_special);
3376     __ addptr(rbx, 1);
3377     __ bind(not_special);
3378 #ifdef ASSERT
3379     // Verify the correct encoding of the poll we're about to skip.
3380     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3381     __ jcc(Assembler::notEqual, bail);
3382     // Mask out the modrm bits
3383     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3384     // rax encodes to 0, so if the bits are nonzero it's incorrect
3385     __ jcc(Assembler::notZero, bail);
3386 #endif
3387     // Adjust return pc forward to step over the safepoint poll instruction
3388     __ addptr(rbx, 2);
3389     __ movptr(Address(rbp, wordSize), rbx);
3390   }
3391 
3392   __ bind(no_adjust);
3393   // Normal exit, restore registers and exit.
3394   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3395   __ ret(0);
3396 
3397 #ifdef ASSERT
3398   __ bind(bail);
3399   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3400 #endif
3401 
3402   // Make sure all code is generated
3403   masm->flush();
3404 
3405   // Fill-out other meta info
3406   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3407 }
3408 
3409 //
3410 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3411 //
3412 // Generate a stub that calls into the VM to find out the proper destination
3413 // of a java call. All the argument registers are live at this point,
3414 // but since this is generic code we don't know what they are, and the caller
3415 // must do any gc of the args.
3416 //
3417 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3418   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3419 
3420   // allocate space for the code
3421   ResourceMark rm;
3422 
3423   CodeBuffer buffer(name, 1552, 512);
3424   MacroAssembler* masm = new MacroAssembler(&buffer);
3425 
3426   int frame_size_in_words;
3427 
3428   OopMapSet *oop_maps = new OopMapSet();
3429   OopMap* map = nullptr;
3430 
3431   int start = __ offset();
3432 
3433   // No need to save vector registers since they are caller-saved anyway.
3434   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3435 
3436   int frame_complete = __ offset();
3437 
3438   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3439 
3440   __ mov(c_rarg0, r15_thread);
3441 
3442   __ call(RuntimeAddress(destination));
3443 
3444 
3445   // Set an oopmap for the call site.
3446   // We need this not only for callee-saved registers, but also for volatile
3447   // registers that the compiler might be keeping live across a safepoint.
3448 
3449   oop_maps->add_gc_map( __ offset() - start, map);
3450 
3451   // rax contains the address we are going to jump to assuming no exception got installed
3452 
3453   // clear last_Java_sp
3454   __ reset_last_Java_frame(false);
3455   // check for pending exceptions
3456   Label pending;
3457   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3458   __ jcc(Assembler::notEqual, pending);
3459 
3460   // get the returned Method*
3461   __ get_vm_result_2(rbx, r15_thread);
3462   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3463 
3464   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3465 
3466   RegisterSaver::restore_live_registers(masm);
3467 
3468   // We are back to the original state on entry and ready to go.
3469 
3470   __ jmp(rax);
3471 
3472   // Pending exception after the safepoint
3473 
3474   __ bind(pending);
3475 
3476   RegisterSaver::restore_live_registers(masm);
3477 
3478   // exception pending => remove activation and forward to exception handler
3479 
3480   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3481 
3482   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3483   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3484 
3485   // -------------
3486   // make sure all code is generated
3487   masm->flush();
3488 
3489   // return the blob
3490   // frame_size_words or bytes??
3491   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3492 }
3493 
3494 // Continuation point for throwing of implicit exceptions that are
3495 // not handled in the current activation. Fabricates an exception
3496 // oop and initiates normal exception dispatching in this
3497 // frame. Since we need to preserve callee-saved values (currently
3498 // only for C2, but done for C1 as well) we need a callee-saved oop
3499 // map and therefore have to make these stubs into RuntimeStubs
3500 // rather than BufferBlobs.  If the compiler needs all registers to
3501 // be preserved between the fault point and the exception handler
3502 // then it must assume responsibility for that in
3503 // AbstractCompiler::continuation_for_implicit_null_exception or
3504 // continuation_for_implicit_division_by_zero_exception. All other
3505 // implicit exceptions (e.g., NullPointerException or
3506 // AbstractMethodError on entry) are either at call sites or
3507 // otherwise assume that stack unwinding will be initiated, so
3508 // caller saved registers were assumed volatile in the compiler.
3509 RuntimeStub* SharedRuntime::generate_throw_exception(const char* name, address runtime_entry) {
3510   // Information about frame layout at time of blocking runtime call.
3511   // Note that we only have to preserve callee-saved registers since
3512   // the compilers are responsible for supplying a continuation point
3513   // if they expect all registers to be preserved.
3514   enum layout {
3515     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3516     rbp_off2,
3517     return_off,
3518     return_off2,
3519     framesize // inclusive of return address
3520   };
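       // An illustrative worked example (assuming the usual values of
       // frame::arg_reg_save_area_bytes): on non-Windows platforms the save
       // area is 0 bytes, so framesize is 4 slots (just the saved rbp and the
       // return address); on Windows the 32-byte register-argument home area
       // adds 8 more slots, for a framesize of 12 slots.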
3521 
3522   int insts_size = 512;
3523   int locs_size  = 64;
3524 
3525   ResourceMark rm;
3526   const char* timer_msg = "SharedRuntime generate_throw_exception";
3527   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3528 
3529   CodeBuffer code(name, insts_size, locs_size);
3530   OopMapSet* oop_maps  = new OopMapSet();
3531   MacroAssembler* masm = new MacroAssembler(&code);
3532 
3533   address start = __ pc();
3534 
3535   // This is an inlined and slightly modified version of call_VM
3536   // which has the ability to fetch the return PC out of
3537   // thread-local storage and also sets up last_Java_sp slightly
3538   // differently than the real call_VM
3539 
3540   __ enter(); // required for proper stackwalking of RuntimeStub frame
3541 
3542   assert(is_even(framesize/2), "sp not 16-byte aligned");
3543 
3544   // return address and rbp are already in place
3545   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3546 
3547   int frame_complete = __ pc() - start;
3548 
3549   // Set up last_Java_sp and last_Java_fp
3550   address the_pc = __ pc();
3551   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3552   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3553 
3554   // Call runtime
3555   __ movptr(c_rarg0, r15_thread);
3556   BLOCK_COMMENT("call runtime_entry");
3557   __ call(RuntimeAddress(runtime_entry));
3558 
3559   // Generate oop map
3560   OopMap* map = new OopMap(framesize, 0);
3561 
3562   oop_maps->add_gc_map(the_pc - start, map);
3563 
3564   __ reset_last_Java_frame(true);
3565 
3566   __ leave(); // required for proper stackwalking of RuntimeStub frame
3567 
3568   // check for pending exceptions
3569 #ifdef ASSERT
3570   Label L;
3571   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3572   __ jcc(Assembler::notEqual, L);
3573   __ should_not_reach_here();
3574   __ bind(L);
3575 #endif // ASSERT
3576   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3577 
3578 
3579   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3580   RuntimeStub* stub =
3581     RuntimeStub::new_runtime_stub(name,
3582                                   &code,
3583                                   frame_complete,
3584                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3585                                   oop_maps, false);
3586   return stub;
3587 }
3588 
3589 //------------------------------Montgomery multiplication------------------------
3590 //
3591 
3592 #ifndef _WINDOWS
3593 
3594 // Subtract 0:b from carry:a.  Return carry.
3595 static julong
3596 sub(julong a[], julong b[], julong carry, long len) {
3597   long long i = 0, cnt = len;
3598   julong tmp;
3599   asm volatile("clc; "
3600                "0: ; "
3601                "mov (%[b], %[i], 8), %[tmp]; "
3602                "sbb %[tmp], (%[a], %[i], 8); "
3603                "inc %[i]; dec %[cnt]; "
3604                "jne 0b; "
3605                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3606                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3607                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3608                : "memory");
3609   return tmp;
3610 }
3611 
3612 // Multiply (unsigned) Long A by Long B, accumulating the double-
3613 // length result into the accumulator formed of T0, T1, and T2.
3614 #define MACC(A, B, T0, T1, T2)                                  \
3615 do {                                                            \
3616   unsigned long hi, lo;                                         \
3617   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3618            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3619            : "r"(A), "a"(B) : "cc");                            \
3620  } while(0)
3621 
3622 // As above, but add twice the double-length result into the
3623 // accumulator.
3624 #define MACC2(A, B, T0, T1, T2)                                 \
3625 do {                                                            \
3626   unsigned long hi, lo;                                         \
3627   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3628            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3629            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3630            : "r"(A), "a"(B) : "cc");                            \
3631  } while(0)
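     // In both macros the triple (T2:T1:T0) behaves as a 192-bit accumulator:
     // MACC adds the 128-bit product A*B to it once, MACC2 adds it twice.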
3632 
3633 #else //_WINDOWS
3634 
3635 static julong
3636 sub(julong a[], julong b[], julong carry, long len) {
3637   long i;
3638   julong tmp;
3639   unsigned char c = 1;
3640   for (i = 0; i < len; i++) {
3641     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3642     a[i] = tmp;
3643   }
3644   c = _addcarry_u64(c, carry, ~0, &tmp);
3645   return tmp;
3646 }
3647 
3648 // Multiply (unsigned) Long A by Long B, accumulating the double-
3649 // length result into the accumulator formed of T0, T1, and T2.
3650 #define MACC(A, B, T0, T1, T2)                          \
3651 do {                                                    \
3652   julong hi, lo;                                        \
3653   lo = _umul128(A, B, &hi);                             \
3654   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3655   c = _addcarry_u64(c, hi, T1, &T1);                    \
3656   _addcarry_u64(c, T2, 0, &T2);                         \
3657  } while(0)
3658 
3659 // As above, but add twice the double-length result into the
3660 // accumulator.
3661 #define MACC2(A, B, T0, T1, T2)                         \
3662 do {                                                    \
3663   julong hi, lo;                                        \
3664   lo = _umul128(A, B, &hi);                             \
3665   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3666   c = _addcarry_u64(c, hi, T1, &T1);                    \
3667   _addcarry_u64(c, T2, 0, &T2);                         \
3668   c = _addcarry_u64(0, lo, T0, &T0);                    \
3669   c = _addcarry_u64(c, hi, T1, &T1);                    \
3670   _addcarry_u64(c, T2, 0, &T2);                         \
3671  } while(0)
3672 
3673 #endif //_WINDOWS
3674 
3675 // Fast Montgomery multiplication.  The derivation of the algorithm is
3676 // in  A Cryptographic Library for the Motorola DSP56000,
3677 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
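     //
     // Roughly speaking, given inputs a, b < n and inv == -n[0]^-1 mod 2^64,
     // the routine computes m == a * b * R^-1 (mod n) with R = 2^(64*len),
     // i.e. the product of two values in Montgomery form, still in
     // Montgomery form.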
3678 
3679 static void NOINLINE
3680 montgomery_multiply(julong a[], julong b[], julong n[],
3681                     julong m[], julong inv, int len) {
3682   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3683   int i;
3684 
3685   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3686 
3687   for (i = 0; i < len; i++) {
3688     int j;
3689     for (j = 0; j < i; j++) {
3690       MACC(a[j], b[i-j], t0, t1, t2);
3691       MACC(m[j], n[i-j], t0, t1, t2);
3692     }
3693     MACC(a[i], b[0], t0, t1, t2);
3694     m[i] = t0 * inv;
3695     MACC(m[i], n[0], t0, t1, t2);
3696 
3697     assert(t0 == 0, "broken Montgomery multiply");
3698 
3699     t0 = t1; t1 = t2; t2 = 0;
3700   }
3701 
3702   for (i = len; i < 2*len; i++) {
3703     int j;
3704     for (j = i-len+1; j < len; j++) {
3705       MACC(a[j], b[i-j], t0, t1, t2);
3706       MACC(m[j], n[i-j], t0, t1, t2);
3707     }
3708     m[i-len] = t0;
3709     t0 = t1; t1 = t2; t2 = 0;
3710   }
3711 
3712   while (t0)
3713     t0 = sub(m, n, t0, len);
3714 }
3715 
3716 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3717 // multiplies so it should be up to 25% faster than Montgomery
3718 // multiplication.  However, its loop control is more complex and it
3719 // may actually run slower on some machines.
3720 
3721 static void NOINLINE
3722 montgomery_square(julong a[], julong n[],
3723                   julong m[], julong inv, int len) {
3724   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3725   int i;
3726 
3727   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3728 
3729   for (i = 0; i < len; i++) {
3730     int j;
3731     int end = (i+1)/2;
3732     for (j = 0; j < end; j++) {
3733       MACC2(a[j], a[i-j], t0, t1, t2);
3734       MACC(m[j], n[i-j], t0, t1, t2);
3735     }
3736     if ((i & 1) == 0) {
3737       MACC(a[j], a[j], t0, t1, t2);
3738     }
3739     for (; j < i; j++) {
3740       MACC(m[j], n[i-j], t0, t1, t2);
3741     }
3742     m[i] = t0 * inv;
3743     MACC(m[i], n[0], t0, t1, t2);
3744 
3745     assert(t0 == 0, "broken Montgomery square");
3746 
3747     t0 = t1; t1 = t2; t2 = 0;
3748   }
3749 
3750   for (i = len; i < 2*len; i++) {
3751     int start = i-len+1;
3752     int end = start + (len - start)/2;
3753     int j;
3754     for (j = start; j < end; j++) {
3755       MACC2(a[j], a[i-j], t0, t1, t2);
3756       MACC(m[j], n[i-j], t0, t1, t2);
3757     }
3758     if ((i & 1) == 0) {
3759       MACC(a[j], a[j], t0, t1, t2);
3760     }
3761     for (; j < len; j++) {
3762       MACC(m[j], n[i-j], t0, t1, t2);
3763     }
3764     m[i-len] = t0;
3765     t0 = t1; t1 = t2; t2 = 0;
3766   }
3767 
3768   while (t0)
3769     t0 = sub(m, n, t0, len);
3770 }
3771 
3772 // Swap words in a longword.
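     // For example, swap(0x0000000100000002ULL) == 0x0000000200000001ULL.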
3773 static julong swap(julong x) {
3774   return (x << 32) | (x >> 32);
3775 }
3776 
3777 // Copy len longwords from s to d, word-swapping as we go.  The
3778 // destination array is reversed.
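     // In effect this converts between the most-significant-word-first jint[]
     // layout handed in by the callers below and the least-significant-word-first
     // julong[] layout that the Montgomery routines above expect.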
3779 static void reverse_words(julong *s, julong *d, int len) {
3780   d += len;
3781   while(len-- > 0) {
3782     d--;
3783     *d = swap(*s);
3784     s++;
3785   }
3786 }
3787 
3788 // The threshold at which squaring is advantageous was determined
3789 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3790 #define MONTGOMERY_SQUARING_THRESHOLD 64
3791 
3792 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3793                                         jint len, jlong inv,
3794                                         jint *m_ints) {
3795   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3796   int longwords = len/2;
3797 
3798   // Make very sure we don't use so much space that the stack might
3799   // overflow.  512 jints corresponds to a 16384-bit integer and
3800   // will use a total of 8K bytes of stack space here.
3801   int divisor = sizeof(julong) * 4;
3802   guarantee(longwords <= 8192 / divisor, "must be");
3803   int total_allocation = longwords * sizeof (julong) * 4;
3804   julong *scratch = (julong *)alloca(total_allocation);
3805 
3806   // Local scratch arrays
3807   julong
3808     *a = scratch + 0 * longwords,
3809     *b = scratch + 1 * longwords,
3810     *n = scratch + 2 * longwords,
3811     *m = scratch + 3 * longwords;
3812 
3813   reverse_words((julong *)a_ints, a, longwords);
3814   reverse_words((julong *)b_ints, b, longwords);
3815   reverse_words((julong *)n_ints, n, longwords);
3816 
3817   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3818 
3819   reverse_words(m, (julong *)m_ints, longwords);
3820 }
3821 
3822 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3823                                       jint len, jlong inv,
3824                                       jint *m_ints) {
3825   assert(len % 2 == 0, "array length in montgomery_square must be even");
3826   int longwords = len/2;
3827 
3828   // Make very sure we don't use so much space that the stack might
3829   // overflow.  512 jints corresponds to a 16384-bit integer and
3830   // will use a total of 6K bytes of stack space here.
3831   int divisor = sizeof(julong) * 3;
3832   guarantee(longwords <= (8192 / divisor), "must be");
3833   int total_allocation = longwords * sizeof (julong) * 3;
3834   julong *scratch = (julong *)alloca(total_allocation);
3835 
3836   // Local scratch arrays
3837   julong
3838     *a = scratch + 0 * longwords,
3839     *n = scratch + 1 * longwords,
3840     *m = scratch + 2 * longwords;
3841 
3842   reverse_words((julong *)a_ints, a, longwords);
3843   reverse_words((julong *)n_ints, n, longwords);
3844 
3845   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3846     ::montgomery_square(a, n, m, (julong)inv, longwords);
3847   } else {
3848     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3849   }
3850 
3851   reverse_words(m, (julong *)m_ints, longwords);
3852 }
3853 
3854 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
3855   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
3856   CodeBuffer buffer(buf);
3857   short buffer_locs[20];
3858   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
3859                                          sizeof(buffer_locs)/sizeof(relocInfo));
3860 
3861   MacroAssembler* masm = new MacroAssembler(&buffer);
3862 
3863   const Array<SigEntry>* sig_vk = vk->extended_sig();
3864   const Array<VMRegPair>* regs = vk->return_regs();
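       // Three entry points are generated below (their offsets are recorded at
       // the end): pack_fields_jobject first resolves the pre-allocated buffer
       // from a JNI handle, pack_fields copies the field values from their
       // return registers into the buffer addressed by rax, and unpack_fields
       // does the reverse (and is skipped when rax is null).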
3865 
3866   int pack_fields_jobject_off = __ offset();
3867   // Resolve pre-allocated buffer from JNI handle.
3868   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
3869   __ movptr(rax, Address(r13, 0));
3870   __ resolve_jobject(rax /* value */,
3871                      r15_thread /* thread */,
3872                      r12 /* tmp */);
3873   __ movptr(Address(r13, 0), rax);
3874 
3875   int pack_fields_off = __ offset();
3876 
3877   int j = 1;
3878   for (int i = 0; i < sig_vk->length(); i++) {
3879     BasicType bt = sig_vk->at(i)._bt;
3880     if (bt == T_METADATA) {
3881       continue;
3882     }
3883     if (bt == T_VOID) {
3884       if (sig_vk->at(i-1)._bt == T_LONG ||
3885           sig_vk->at(i-1)._bt == T_DOUBLE) {
3886         j++;
3887       }
3888       continue;
3889     }
3890     int off = sig_vk->at(i)._offset;
3891     assert(off > 0, "offset in object should be positive");
3892     VMRegPair pair = regs->at(j);
3893     VMReg r_1 = pair.first();
3894     VMReg r_2 = pair.second();
3895     Address to(rax, off);
3896     if (bt == T_FLOAT) {
3897       __ movflt(to, r_1->as_XMMRegister());
3898     } else if (bt == T_DOUBLE) {
3899       __ movdbl(to, r_1->as_XMMRegister());
3900     } else {
3901       Register val = r_1->as_Register();
3902       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
3903       if (is_reference_type(bt)) {
3904         __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
3905       } else {
3906         __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
3907       }
3908     }
3909     j++;
3910   }
3911   assert(j == regs->length(), "missed a field?");
3912 
3913   __ ret(0);
3914 
3915   int unpack_fields_off = __ offset();
3916 
3917   Label skip;
3918   __ testptr(rax, rax);
3919   __ jcc(Assembler::zero, skip);
3920 
3921   j = 1;
3922   for (int i = 0; i < sig_vk->length(); i++) {
3923     BasicType bt = sig_vk->at(i)._bt;
3924     if (bt == T_METADATA) {
3925       continue;
3926     }
3927     if (bt == T_VOID) {
3928       if (sig_vk->at(i-1)._bt == T_LONG ||
3929           sig_vk->at(i-1)._bt == T_DOUBLE) {
3930         j++;
3931       }
3932       continue;
3933     }
3934     int off = sig_vk->at(i)._offset;
3935     assert(off > 0, "offset in object should be positive");
3936     VMRegPair pair = regs->at(j);
3937     VMReg r_1 = pair.first();
3938     VMReg r_2 = pair.second();
3939     Address from(rax, off);
3940     if (bt == T_FLOAT) {
3941       __ movflt(r_1->as_XMMRegister(), from);
3942     } else if (bt == T_DOUBLE) {
3943       __ movdbl(r_1->as_XMMRegister(), from);
3944     } else if (bt == T_OBJECT || bt == T_ARRAY) {
3945       assert_different_registers(rax, r_1->as_Register());
3946       __ load_heap_oop(r_1->as_Register(), from);
3947     } else {
3948       assert(is_java_primitive(bt), "unexpected basic type");
3949       assert_different_registers(rax, r_1->as_Register());
3950       size_t size_in_bytes = type2aelembytes(bt);
3951       __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
3952     }
3953     j++;
3954   }
3955   assert(j == regs->length(), "missed a field?");
3956 
3957   __ bind(skip);
3958   __ ret(0);
3959 
3960   __ flush();
3961 
3962   return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
3963 }
3964 
3965 #if INCLUDE_JFR
3966 
3967 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3968 // It returns a jobject handle to the event writer.
3969 // The handle is dereferenced and the return value is the event writer oop.
3970 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3971   enum layout {
3972     rbp_off,
3973     rbpH_off,
3974     return_off,
3975     return_off2,
3976     framesize // inclusive of return address
3977   };
3978 
3979   CodeBuffer code("jfr_write_checkpoint", 1024, 64);
3980   MacroAssembler* masm = new MacroAssembler(&code);
3981   address start = __ pc();
3982 
3983   __ enter();
3984   address the_pc = __ pc();
3985 
3986   int frame_complete = the_pc - start;
3987 
3988   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3989   __ movptr(c_rarg0, r15_thread);
3990   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3991   __ reset_last_Java_frame(true);
3992 
3993   // rax is jobject handle result, unpack and process it through a barrier.
3994   __ resolve_global_jobject(rax, r15_thread, c_rarg0);
3995 
3996   __ leave();
3997   __ ret(0);
3998 
3999   OopMapSet* oop_maps = new OopMapSet();
4000   OopMap* map = new OopMap(framesize, 1);
4001   oop_maps->add_gc_map(frame_complete, map);
4002 
4003   RuntimeStub* stub =
4004     RuntimeStub::new_runtime_stub(code.name(),
4005                                   &code,
4006                                   frame_complete,
4007                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4008                                   oop_maps,
4009                                   false);
4010   return stub;
4011 }
4012 
4013 // For c2: call to return a leased buffer.
4014 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
4015   enum layout {
4016     rbp_off,
4017     rbpH_off,
4018     return_off,
4019     return_off2,
4020     framesize // inclusive of return address
4021   };
4022 
4023   CodeBuffer code("jfr_return_lease", 1024, 64);
4024   MacroAssembler* masm = new MacroAssembler(&code);
4025   address start = __ pc();
4026 
4027   __ enter();
4028   address the_pc = __ pc();
4029 
4030   int frame_complete = the_pc - start;
4031 
4032   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
4033   __ movptr(c_rarg0, r15_thread);
4034   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
4035   __ reset_last_Java_frame(true);
4036 
4037   __ leave();
4038   __ ret(0);
4039 
4040   OopMapSet* oop_maps = new OopMapSet();
4041   OopMap* map = new OopMap(framesize, 1);
4042   oop_maps->add_gc_map(frame_complete, map);
4043 
4044   RuntimeStub* stub =
4045     RuntimeStub::new_runtime_stub(code.name(),
4046                                   &code,
4047                                   frame_complete,
4048                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4049                                   oop_maps,
4050                                   false);
4051   return stub;
4052 }
4053 
4054 #endif // INCLUDE_JFR