1 /*
   2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "code/aotCodeCache.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/timerTrace.hpp"
  56 #include "runtime/vframeArray.hpp"
  57 #include "runtime/vm_version.hpp"
  58 #include "utilities/align.hpp"
  59 #include "utilities/checkedCast.hpp"
  60 #include "utilities/formatBuffer.hpp"
  61 #include "vmreg_x86.inline.hpp"
  62 #ifdef COMPILER1
  63 #include "c1/c1_Runtime1.hpp"
  64 #endif
  65 #ifdef COMPILER2
  66 #include "opto/runtime.hpp"
  67 #endif
  68 
  69 #define __ masm->
  70 
  71 #ifdef PRODUCT
  72 #define BLOCK_COMMENT(str) /* nothing */
  73 #else
  74 #define BLOCK_COMMENT(str) __ block_comment(str)
  75 #endif // PRODUCT
  76 
  77 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  78 
  79 class RegisterSaver {
  80   // Capture info about frame layout.  Layout offsets are in jint
  81   // units because compiler frame slots are jints.
  82 #define XSAVE_AREA_BEGIN 160
  83 #define XSAVE_AREA_YMM_BEGIN 576
  84 #define XSAVE_AREA_EGPRS 960
  85 #define XSAVE_AREA_OPMASK_BEGIN 1088
  86 #define XSAVE_AREA_ZMM_BEGIN 1152
  87 #define XSAVE_AREA_UPPERBANK 1664
  88 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  89 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  90 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  91 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  92 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  93   enum layout {
  94     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  95     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
  96     DEF_XMM_OFFS(0),
  97     DEF_XMM_OFFS(1),
  98     // 2..15 are implied in range usage
  99     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 100     DEF_YMM_OFFS(0),
 101     DEF_YMM_OFFS(1),
 102     r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 103     r16H_off,
 104     r17_off, r17H_off,
 105     r18_off, r18H_off,
 106     r19_off, r19H_off,
 107     r20_off, r20H_off,
 108     r21_off, r21H_off,
 109     r22_off, r22H_off,
 110     r23_off, r23H_off,
 111     r24_off, r24H_off,
 112     r25_off, r25H_off,
 113     r26_off, r26H_off,
 114     r27_off, r27H_off,
 115     r28_off, r28H_off,
 116     r29_off, r29H_off,
 117     r30_off, r30H_off,
 118     r31_off, r31H_off,
 119     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 120     DEF_OPMASK_OFFS(0),
 121     DEF_OPMASK_OFFS(1),
 122     // 2..7 are implied in range usage
 123     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 124     DEF_ZMM_OFFS(0),
 125     DEF_ZMM_OFFS(1),
 126     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 127     DEF_ZMM_UPPER_OFFS(16),
 128     DEF_ZMM_UPPER_OFFS(17),
 129     // 18..31 are implied in range usage
 130     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 131     fpu_stateH_end,
 132     r15_off, r15H_off,
 133     r14_off, r14H_off,
 134     r13_off, r13H_off,
 135     r12_off, r12H_off,
 136     r11_off, r11H_off,
 137     r10_off, r10H_off,
 138     r9_off,  r9H_off,
 139     r8_off,  r8H_off,
 140     rdi_off, rdiH_off,
 141     rsi_off, rsiH_off,
 142     ignore_off, ignoreH_off,  // extra copy of rbp
 143     rsp_off, rspH_off,
 144     rbx_off, rbxH_off,
 145     rdx_off, rdxH_off,
 146     rcx_off, rcxH_off,
 147     rax_off, raxH_off,
 148     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 149     align_off, alignH_off,
 150     flags_off, flagsH_off,
 151     // The frame sender code expects that rbp will be in the "natural" place and
 152     // will override any oopMap setting for it. We must therefore force the layout
 153     // so that it agrees with the frame sender code.
 154     rbp_off, rbpH_off,        // copy of rbp we will restore
 155     return_off, returnH_off,  // slot for return address
 156     reg_save_size             // size in compiler stack slots
 157   };
 158 
 159  public:
 160   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 161   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 162 
 163   // Offsets into the register save area
 164   // Used by deoptimization when it is managing result register
 165   // values on its own
 166 
 167   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 168   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 169   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 170   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 171   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 172   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 173 
 174   // During deoptimization only the result registers need to be restored,
 175   // all the other values have already been extracted.
 176   static void restore_result_registers(MacroAssembler* masm);
 177 };
 178 
 179 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 180   int off = 0;
 181   int num_xmm_regs = XMMRegister::available_xmm_registers();
 182 #ifdef COMPILER2
 183   if (save_wide_vectors && UseAVX == 0) {
 184     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 185   }
 186   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 187 #else
 188   save_wide_vectors = false; // vectors are generated only by C2
 189 #endif // COMPILER2
 190 
 191   // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
 192   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 193   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 194   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 195   // CodeBlob frame size is in words.
 196   int frame_size_in_words = frame_size_in_bytes / wordSize;
 197   *total_frame_words = frame_size_in_words;
 198 
 199   // Save registers, fpu state, and flags.
 200   // We assume caller has already pushed the return address onto the
 201   // stack, so rsp is 8-byte aligned here.
 202   // We push rpb twice in this sequence because we want the real rbp
 203   // to be under the return like a normal enter.
 204 
 205   __ enter();          // rsp becomes 16-byte aligned here
 206   __ pushf();
 207   // Make sure rsp stays 16-byte aligned
 208   __ subq(rsp, 8);
 209   // Push CPU state in multiple of 16 bytes
 210   __ save_legacy_gprs();
 211   __ push_FPU_state();
 212 
 213 
 214   // push cpu state handles this on EVEX enabled targets
 215   if (save_wide_vectors) {
 216     // Save upper half of YMM registers(0..15)
 217     int base_addr = XSAVE_AREA_YMM_BEGIN;
 218     for (int n = 0; n < 16; n++) {
 219       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 220     }
 221     if (VM_Version::supports_evex()) {
 222       // Save upper half of ZMM registers(0..15)
 223       base_addr = XSAVE_AREA_ZMM_BEGIN;
 224       for (int n = 0; n < 16; n++) {
 225         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 226       }
 227       // Save full ZMM registers(16..num_xmm_regs)
 228       base_addr = XSAVE_AREA_UPPERBANK;
 229       off = 0;
 230       int vector_len = Assembler::AVX_512bit;
 231       for (int n = 16; n < num_xmm_regs; n++) {
 232         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 233       }
 234 #ifdef COMPILER2
 235       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 236       off = 0;
 237       for(int n = 0; n < KRegister::number_of_registers; n++) {
 238         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 239       }
 240 #endif // COMPILER2
 241     }
 242   } else {
 243     if (VM_Version::supports_evex()) {
 244       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 245       int base_addr = XSAVE_AREA_UPPERBANK;
 246       off = 0;
 247       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 248       for (int n = 16; n < num_xmm_regs; n++) {
 249         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 250       }
 251 #ifdef COMPILER2
 252       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 253       off = 0;
 254       for(int n = 0; n < KRegister::number_of_registers; n++) {
 255         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 256       }
 257 #endif // COMPILER2
 258     }
 259   }
 260 
 261 #ifdef COMPILER2
 262   if (UseAPX) {
 263       int base_addr = XSAVE_AREA_EGPRS;
 264       off = 0;
 265       for (int n = 16; n < Register::number_of_registers; n++) {
 266         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 267       }
 268   }
 269 #endif // COMPILER2
 270 
 271   __ vzeroupper();
 272   if (frame::arg_reg_save_area_bytes != 0) {
 273     // Allocate argument register save area
 274     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 275   }
 276 
 277   // Set an oopmap for the call site.  This oopmap will map all
 278   // oop-registers and debug-info registers as callee-saved.  This
 279   // will allow deoptimization at this safepoint to find all possible
 280   // debug-info recordings, as well as let GC find all oops.
 281 
 282   OopMapSet *oop_maps = new OopMapSet();
 283   OopMap* map = new OopMap(frame_size_in_slots, 0);
 284 
 285 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 286 
 287   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 288   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 289   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 290   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 291   // rbp location is known implicitly by the frame sender code, needs no oopmap
 292   // and the location where rbp was saved by is ignored
 293   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 294   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 295   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 296   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 297   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 303 
 304   if (UseAPX) {
 305     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 306     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 307     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 308     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 309     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 321   }
 322   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 323   // on EVEX enabled targets, we get it included in the xsave area
 324   off = xmm0_off;
 325   int delta = xmm1_off - off;
 326   for (int n = 0; n < 16; n++) {
 327     XMMRegister xmm_name = as_XMMRegister(n);
 328     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 329     off += delta;
 330   }
 331   if (UseAVX > 2) {
 332     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 333     off = zmm16_off;
 334     delta = zmm17_off - off;
 335     for (int n = 16; n < num_xmm_regs; n++) {
 336       XMMRegister zmm_name = as_XMMRegister(n);
 337       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 338       off += delta;
 339     }
 340   }
 341 
 342 #ifdef COMPILER2
 343   if (save_wide_vectors) {
 344     // Save upper half of YMM registers(0..15)
 345     off = ymm0_off;
 346     delta = ymm1_off - ymm0_off;
 347     for (int n = 0; n < 16; n++) {
 348       XMMRegister ymm_name = as_XMMRegister(n);
 349       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 350       off += delta;
 351     }
 352     if (VM_Version::supports_evex()) {
 353       // Save upper half of ZMM registers(0..15)
 354       off = zmm0_off;
 355       delta = zmm1_off - zmm0_off;
 356       for (int n = 0; n < 16; n++) {
 357         XMMRegister zmm_name = as_XMMRegister(n);
 358         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 359         off += delta;
 360       }
 361     }
 362   }
 363 #endif // COMPILER2
 364 
 365   // %%% These should all be a waste but we'll keep things as they were for now
 366   if (true) {
 367     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 368     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 369     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 370     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 371     // rbp location is known implicitly by the frame sender code, needs no oopmap
 372     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 373     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 374     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 375     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 376     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 382     if (UseAPX) {
 383       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 384       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 385       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 386       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 387       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 399     }
 400     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 401     // on EVEX enabled targets, we get it included in the xsave area
 402     off = xmm0H_off;
 403     delta = xmm1H_off - off;
 404     for (int n = 0; n < 16; n++) {
 405       XMMRegister xmm_name = as_XMMRegister(n);
 406       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 407       off += delta;
 408     }
 409     if (UseAVX > 2) {
 410       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 411       off = zmm16H_off;
 412       delta = zmm17H_off - off;
 413       for (int n = 16; n < num_xmm_regs; n++) {
 414         XMMRegister zmm_name = as_XMMRegister(n);
 415         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 416         off += delta;
 417       }
 418     }
 419   }
 420 
 421   return map;
 422 }
 423 
 424 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 425   int num_xmm_regs = XMMRegister::available_xmm_registers();
 426   if (frame::arg_reg_save_area_bytes != 0) {
 427     // Pop arg register save area
 428     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 429   }
 430 
 431 #ifdef COMPILER2
 432   if (restore_wide_vectors) {
 433     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 434     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 435   }
 436 #else
 437   assert(!restore_wide_vectors, "vectors are generated only by C2");
 438 #endif // COMPILER2
 439 
 440   __ vzeroupper();
 441 
 442   // On EVEX enabled targets everything is handled in pop fpu state
 443   if (restore_wide_vectors) {
 444     // Restore upper half of YMM registers (0..15)
 445     int base_addr = XSAVE_AREA_YMM_BEGIN;
 446     for (int n = 0; n < 16; n++) {
 447       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 448     }
 449     if (VM_Version::supports_evex()) {
 450       // Restore upper half of ZMM registers (0..15)
 451       base_addr = XSAVE_AREA_ZMM_BEGIN;
 452       for (int n = 0; n < 16; n++) {
 453         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 454       }
 455       // Restore full ZMM registers(16..num_xmm_regs)
 456       base_addr = XSAVE_AREA_UPPERBANK;
 457       int vector_len = Assembler::AVX_512bit;
 458       int off = 0;
 459       for (int n = 16; n < num_xmm_regs; n++) {
 460         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 461       }
 462 #ifdef COMPILER2
 463       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 464       off = 0;
 465       for (int n = 0; n < KRegister::number_of_registers; n++) {
 466         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 467       }
 468 #endif // COMPILER2
 469     }
 470   } else {
 471     if (VM_Version::supports_evex()) {
 472       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 473       int base_addr = XSAVE_AREA_UPPERBANK;
 474       int off = 0;
 475       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 476       for (int n = 16; n < num_xmm_regs; n++) {
 477         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 478       }
 479 #ifdef COMPILER2
 480       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 481       off = 0;
 482       for (int n = 0; n < KRegister::number_of_registers; n++) {
 483         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 484       }
 485 #endif // COMPILER2
 486     }
 487   }
 488 
 489 #ifdef COMPILER2
 490   if (UseAPX) {
 491     int base_addr = XSAVE_AREA_EGPRS;
 492     int off = 0;
 493     for (int n = 16; n < Register::number_of_registers; n++) {
 494       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 495     }
 496   }
 497 #endif // COMPILER2
 498 
 499   // Recover CPU state
 500   __ pop_FPU_state();
 501   __ restore_legacy_gprs();
 502   __ addq(rsp, 8);
 503   __ popf();
 504   // Get the rbp described implicitly by the calling convention (no oopMap)
 505   __ pop(rbp);
 506 }
 507 
 508 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 509 
 510   // Just restore result register. Only used by deoptimization. By
 511   // now any callee save register that needs to be restored to a c2
 512   // caller of the deoptee has been extracted into the vframeArray
 513   // and will be stuffed into the c2i adapter we create for later
 514   // restoration so only result registers need to be restored here.
 515 
 516   // Restore fp result register
 517   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 518   // Restore integer result register
 519   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 520   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 521 
 522   // Pop all of the register save are off the stack except the return address
 523   __ addptr(rsp, return_offset_in_bytes());
 524 }
 525 
 526 // Is vector's size (in bytes) bigger than a size saved by default?
 527 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
 528 bool SharedRuntime::is_wide_vector(int size) {
 529   return size > 16;
 530 }
 531 
 532 // ---------------------------------------------------------------------------
 533 // Read the array of BasicTypes from a signature, and compute where the
 534 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 535 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 536 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 537 // as framesizes are fixed.
 538 // VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
 541 // integer registers.
 542 
 543 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 544 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 545 // units regardless of build. Of course for i486 there is no 64 bit build
 546 
 547 // The Java calling convention is a "shifted" version of the C ABI.
 548 // By skipping the first C ABI register we can call non-static jni methods
 549 // with small numbers of arguments without having to shuffle the arguments
 550 // at all. Since we control the java ABI we ought to at least get some
 551 // advantage out of it.
 552 
 553 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 554                                            VMRegPair *regs,
 555                                            int total_args_passed) {
 556 
 557   // Create the mapping between argument positions and
 558   // registers.
 559   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 560     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 561   };
 562   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 563     j_farg0, j_farg1, j_farg2, j_farg3,
 564     j_farg4, j_farg5, j_farg6, j_farg7
 565   };
 566 
 567 
 568   uint int_args = 0;
 569   uint fp_args = 0;
 570   uint stk_args = 0;
 571 
 572   for (int i = 0; i < total_args_passed; i++) {
 573     switch (sig_bt[i]) {
 574     case T_BOOLEAN:
 575     case T_CHAR:
 576     case T_BYTE:
 577     case T_SHORT:
 578     case T_INT:
 579       if (int_args < Argument::n_int_register_parameters_j) {
 580         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 581       } else {
 582         stk_args = align_up(stk_args, 2);
 583         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 584         stk_args += 1;
 585       }
 586       break;
 587     case T_VOID:
 588       // halves of T_LONG or T_DOUBLE
 589       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 590       regs[i].set_bad();
 591       break;
 592     case T_LONG:
 593       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 594       // fall through
 595     case T_OBJECT:
 596     case T_ARRAY:
 597     case T_ADDRESS:
 598       if (int_args < Argument::n_int_register_parameters_j) {
 599         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 600       } else {
 601         stk_args = align_up(stk_args, 2);
 602         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 603         stk_args += 2;
 604       }
 605       break;
 606     case T_FLOAT:
 607       if (fp_args < Argument::n_float_register_parameters_j) {
 608         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 609       } else {
 610         stk_args = align_up(stk_args, 2);
 611         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 612         stk_args += 1;
 613       }
 614       break;
 615     case T_DOUBLE:
 616       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 617       if (fp_args < Argument::n_float_register_parameters_j) {
 618         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 619       } else {
 620         stk_args = align_up(stk_args, 2);
 621         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 622         stk_args += 2;
 623       }
 624       break;
 625     default:
 626       ShouldNotReachHere();
 627       break;
 628     }
 629   }
 630 
 631   return stk_args;
 632 }
 633 
 634 // Patch the callers callsite with entry to compiled code if it exists.
 635 static void patch_callers_callsite(MacroAssembler *masm) {
 636   Label L;
 637   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 638   __ jcc(Assembler::equal, L);
 639 
 640   // Save the current stack pointer
 641   __ mov(r13, rsp);
 642   // Schedule the branch target address early.
 643   // Call into the VM to patch the caller, then jump to compiled callee
 644   // rax isn't live so capture return address while we easily can
 645   __ movptr(rax, Address(rsp, 0));
 646 
 647   // align stack so push_CPU_state doesn't fault
 648   __ andptr(rsp, -(StackAlignmentInBytes));
 649   __ push_CPU_state();
 650   __ vzeroupper();
 651   // VM needs caller's callsite
 652   // VM needs target method
 653   // This needs to be a long call since we will relocate this adapter to
 654   // the codeBuffer and it may not reach
 655 
 656   // Allocate argument register save area
 657   if (frame::arg_reg_save_area_bytes != 0) {
 658     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 659   }
 660   __ mov(c_rarg0, rbx);
 661   __ mov(c_rarg1, rax);
 662   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 663 
 664   // De-allocate argument register save area
 665   if (frame::arg_reg_save_area_bytes != 0) {
 666     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 667   }
 668 
 669   __ vzeroupper();
 670   __ pop_CPU_state();
 671   // restore sp
 672   __ mov(rsp, r13);
 673   __ bind(L);
 674 }
 675 
 676 static void gen_c2i_adapter(MacroAssembler *masm,
 677                             int total_args_passed,
 678                             int comp_args_on_stack,
 679                             const BasicType *sig_bt,
 680                             const VMRegPair *regs,
 681                             Label& skip_fixup) {
 682   // Before we get into the guts of the C2I adapter, see if we should be here
 683   // at all.  We've come from compiled code and are attempting to jump to the
 684   // interpreter, which means the caller made a static call to get here
 685   // (vcalls always get a compiled target if there is one).  Check for a
 686   // compiled target.  If there is one, we need to patch the caller's call.
 687   patch_callers_callsite(masm);
 688 
 689   __ bind(skip_fixup);
 690 
 691   // Since all args are passed on the stack, total_args_passed *
 692   // Interpreter::stackElementSize is the space we need.
 693 
 694   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 695 
 696   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 697 
 698   // stack is aligned, keep it that way
 699   // This is not currently needed or enforced by the interpreter, but
 700   // we might as well conform to the ABI.
 701   extraspace = align_up(extraspace, 2*wordSize);
 702 
 703   // set senderSP value
 704   __ lea(r13, Address(rsp, wordSize));
 705 
 706 #ifdef ASSERT
 707   __ check_stack_alignment(r13, "sender stack not aligned");
 708 #endif
 709   if (extraspace > 0) {
 710     // Pop the return address
 711     __ pop(rax);
 712 
 713     __ subptr(rsp, extraspace);
 714 
 715     // Push the return address
 716     __ push(rax);
 717 
 718     // Account for the return address location since we store it first rather
 719     // than hold it in a register across all the shuffling
 720     extraspace += wordSize;
 721   }
 722 
 723 #ifdef ASSERT
 724   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 725 #endif
 726 
 727   // Now write the args into the outgoing interpreter space
 728   for (int i = 0; i < total_args_passed; i++) {
 729     if (sig_bt[i] == T_VOID) {
 730       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 731       continue;
 732     }
 733 
 734     // offset to start parameters
 735     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 736     int next_off = st_off - Interpreter::stackElementSize;
 737 
 738     // Say 4 args:
 739     // i   st_off
 740     // 0   32 T_LONG
 741     // 1   24 T_VOID
 742     // 2   16 T_OBJECT
 743     // 3    8 T_BOOL
 744     // -    0 return address
 745     //
 746     // However to make thing extra confusing. Because we can fit a long/double in
 747     // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter
 748     // leaves one slot empty and only stores to a single slot. In this case the
 749     // slot that is occupied is the T_VOID slot. See I said it was confusing.
 750 
 751     VMReg r_1 = regs[i].first();
 752     VMReg r_2 = regs[i].second();
 753     if (!r_1->is_valid()) {
 754       assert(!r_2->is_valid(), "");
 755       continue;
 756     }
 757     if (r_1->is_stack()) {
 758       // memory to memory use rax
 759       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 760       if (!r_2->is_valid()) {
 761         // sign extend??
 762         __ movl(rax, Address(rsp, ld_off));
 763         __ movptr(Address(rsp, st_off), rax);
 764 
 765       } else {
 766 
 767         __ movq(rax, Address(rsp, ld_off));
 768 
 769         // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 770         // T_DOUBLE and T_LONG use two slots in the interpreter
 771         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 772           // ld_off == LSW, ld_off+wordSize == MSW
 773           // st_off == MSW, next_off == LSW
 774           __ movq(Address(rsp, next_off), rax);
 775 #ifdef ASSERT
 776           // Overwrite the unused slot with known junk
 777           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 778           __ movptr(Address(rsp, st_off), rax);
 779 #endif /* ASSERT */
 780         } else {
 781           __ movq(Address(rsp, st_off), rax);
 782         }
 783       }
 784     } else if (r_1->is_Register()) {
 785       Register r = r_1->as_Register();
 786       if (!r_2->is_valid()) {
 787         // must be only an int (or less ) so move only 32bits to slot
 788         // why not sign extend??
 789         __ movl(Address(rsp, st_off), r);
 790       } else {
 791         // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 792         // T_DOUBLE and T_LONG use two slots in the interpreter
 793         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 794           // long/double in gpr
 795 #ifdef ASSERT
 796           // Overwrite the unused slot with known junk
 797           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 798           __ movptr(Address(rsp, st_off), rax);
 799 #endif /* ASSERT */
 800           __ movq(Address(rsp, next_off), r);
 801         } else {
 802           __ movptr(Address(rsp, st_off), r);
 803         }
 804       }
 805     } else {
 806       assert(r_1->is_XMMRegister(), "");
 807       if (!r_2->is_valid()) {
 808         // only a float use just part of the slot
 809         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 810       } else {
 811 #ifdef ASSERT
 812         // Overwrite the unused slot with known junk
 813         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 814         __ movptr(Address(rsp, st_off), rax);
 815 #endif /* ASSERT */
 816         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 817       }
 818     }
 819   }
 820 
 821   // Schedule the branch target address early.
 822   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 823   __ jmp(rcx);
 824 }
 825 
 826 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 827                                     int total_args_passed,
 828                                     int comp_args_on_stack,
 829                                     const BasicType *sig_bt,
 830                                     const VMRegPair *regs) {
 831 
 832   // Note: r13 contains the senderSP on entry. We must preserve it since
 833   // we may do a i2c -> c2i transition if we lose a race where compiled
 834   // code goes non-entrant while we get args ready.
 835   // In addition we use r13 to locate all the interpreter args as
 836   // we must align the stack to 16 bytes on an i2c entry else we
 837   // lose alignment we expect in all compiled code and register
 838   // save code can segv when fxsave instructions find improperly
 839   // aligned stack pointer.
 840 
 841   // Adapters can be frameless because they do not require the caller
 842   // to perform additional cleanup work, such as correcting the stack pointer.
 843   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 844   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 845   // even if a callee has modified the stack pointer.
 846   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 847   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 848   // up via the senderSP register).
 849   // In other words, if *either* the caller or callee is interpreted, we can
 850   // get the stack pointer repaired after a call.
 851   // This is why c2i and i2c adapters cannot be indefinitely composed.
 852   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 853   // both caller and callee would be compiled methods, and neither would
 854   // clean up the stack pointer changes performed by the two adapters.
 855   // If this happens, control eventually transfers back to the compiled
 856   // caller, but with an uncorrected stack, causing delayed havoc.
 857 
 858   // Must preserve original SP for loading incoming arguments because
 859   // we need to align the outgoing SP for compiled code.
 860   __ movptr(r11, rsp);
 861 
 862   // Pick up the return address
 863   __ pop(rax);
 864 
 865   // Convert 4-byte c2 stack slots to words.
 866   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 867 
 868   if (comp_args_on_stack) {
 869     __ subptr(rsp, comp_words_on_stack * wordSize);
 870   }
 871 
 872   // Ensure compiled code always sees stack at proper alignment
 873   __ andptr(rsp, -16);
 874 
 875   // push the return address and misalign the stack that youngest frame always sees
 876   // as far as the placement of the call instruction
 877   __ push(rax);
 878 
 879   // Put saved SP in another register
 880   const Register saved_sp = rax;
 881   __ movptr(saved_sp, r11);
 882 
 883   // Will jump to the compiled code just as if compiled code was doing it.
 884   // Pre-load the register-jump target early, to schedule it better.
 885   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 886 
 887   // Now generate the shuffle code.  Pick up all register args and move the
 888   // rest through the floating point stack top.
 889   for (int i = 0; i < total_args_passed; i++) {
 890     if (sig_bt[i] == T_VOID) {
 891       // Longs and doubles are passed in native word order, but misaligned
 892       // in the 32-bit build.
 893       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 894       continue;
 895     }
 896 
 897     // Pick up 0, 1 or 2 words from SP+offset.
 898 
 899     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 900             "scrambled load targets?");
 901     // Load in argument order going down.
 902     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 903     // Point to interpreter value (vs. tag)
 904     int next_off = ld_off - Interpreter::stackElementSize;
 905     //
 906     //
 907     //
 908     VMReg r_1 = regs[i].first();
 909     VMReg r_2 = regs[i].second();
 910     if (!r_1->is_valid()) {
 911       assert(!r_2->is_valid(), "");
 912       continue;
 913     }
 914     if (r_1->is_stack()) {
 915       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 916       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 917 
 918       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 919       // and if we end up going thru a c2i because of a miss a reasonable value of r13
 920       // will be generated.
 921       if (!r_2->is_valid()) {
 922         // sign extend???
 923         __ movl(r13, Address(saved_sp, ld_off));
 924         __ movptr(Address(rsp, st_off), r13);
 925       } else {
 926         //
 927         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 928         // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
 929         // So we must adjust where to pick up the data to match the interpreter.
 930         //
 931         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
 932         // are accessed as negative so LSW is at LOW address
 933 
 934         // ld_off is MSW so get LSW
 935         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 936                            next_off : ld_off;
 937         __ movq(r13, Address(saved_sp, offset));
 938         // st_off is LSW (i.e. reg.first())
 939         __ movq(Address(rsp, st_off), r13);
 940       }
 941     } else if (r_1->is_Register()) {  // Register argument
 942       Register r = r_1->as_Register();
 943       assert(r != rax, "must be different");
 944       if (r_2->is_valid()) {
 945         //
 946         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 947         // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
 948         // So we must adjust where to pick up the data to match the interpreter.
 949 
 950         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 951                            next_off : ld_off;
 952 
 953         // this can be a misaligned move
 954         __ movq(r, Address(saved_sp, offset));
 955       } else {
 956         // sign extend and use a full word?
 957         __ movl(r, Address(saved_sp, ld_off));
 958       }
 959     } else {
 960       if (!r_2->is_valid()) {
 961         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 962       } else {
 963         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 964       }
 965     }
 966   }
 967 
 968   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 969 
 970   // 6243940 We might end up in handle_wrong_method if
 971   // the callee is deoptimized as we race thru here. If that
 972   // happens we don't want to take a safepoint because the
 973   // caller frame will look interpreted and arguments are now
 974   // "compiled" so it is much better to make this transition
 975   // invisible to the stack walking code. Unfortunately if
 976   // we try and find the callee by normal means a safepoint
 977   // is possible. So we stash the desired callee in the thread
 978   // and the vm will find there should this case occur.
 979 
 980   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 981 
 982   // put Method* where a c2i would expect should we end up there
 983   // only needed because eof c2 resolve stubs return Method* as a result in
 984   // rax
 985   __ mov(rax, rbx);
 986   __ jmp(r11);
 987 }
 988 
 989 // ---------------------------------------------------------------
 990 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 991                                             int total_args_passed,
 992                                             int comp_args_on_stack,
 993                                             const BasicType *sig_bt,
 994                                             const VMRegPair *regs,
 995                                             address entry_address[AdapterBlob::ENTRY_COUNT]) {
 996   entry_address[AdapterBlob::I2C] = __ pc();
 997 
 998   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 999 
1000   // -------------------------------------------------------------------------
1001   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1002   // to the interpreter.  The args start out packed in the compiled layout.  They
1003   // need to be unpacked into the interpreter layout.  This will almost always
1004   // require some stack space.  We grow the current (compiled) stack, then repack
1005   // the args.  We  finally end in a jump to the generic interpreter entry point.
1006   // On exit from the interpreter, the interpreter will restore our SP (lest the
1007   // compiled code, which relies solely on SP and not RBP, get sick).
1008 
1009   entry_address[AdapterBlob::C2I_Unverified] = __ pc();
1010   Label skip_fixup;
1011 
1012   Register data = rax;
1013   Register receiver = j_rarg0;
1014   Register temp = rbx;
1015 
1016   {
1017     __ ic_check(1 /* end_alignment */);
1018     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1019     // Method might have been compiled since the call site was patched to
1020     // interpreted if that is the case treat it as a miss so we can get
1021     // the call site corrected.
1022     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1023     __ jcc(Assembler::equal, skip_fixup);
1024     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1025   }
1026 
1027   entry_address[AdapterBlob::C2I] = __ pc();
1028 
1029   // Class initialization barrier for static methods
1030   entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
1031   assert(VM_Version::supports_fast_class_init_checks(), "sanity");
1032   Label L_skip_barrier;
1033   Register method = rbx;
1034 
1035   // Bypass the barrier for non-static methods
1036   Register flags = rscratch1;
1037   __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
1038   __ testl(flags, JVM_ACC_STATIC);
1039   __ jcc(Assembler::zero, L_skip_barrier); // non-static
1040 
1041   Register klass = rscratch1;
1042   __ load_method_holder(klass, method);
1043   __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1044 
1045   __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1046 
1047   __ bind(L_skip_barrier);
1048   entry_address[AdapterBlob::C2I_No_Clinit_Check] = __ pc();
1049 
1050   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1051   bs->c2i_entry_barrier(masm);
1052 
1053   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1054   return;
1055 }
1056 
1057 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1058                                          VMRegPair *regs,
1059                                          int total_args_passed) {
1060 
1061 // We return the amount of VMRegImpl stack slots we need to reserve for all
1062 // the arguments NOT counting out_preserve_stack_slots.
1063 
1064 // NOTE: These arrays will have to change when c1 is ported
1065 #ifdef _WIN64
1066     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1067       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1068     };
1069     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1070       c_farg0, c_farg1, c_farg2, c_farg3
1071     };
1072 #else
1073     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1074       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1075     };
1076     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1077       c_farg0, c_farg1, c_farg2, c_farg3,
1078       c_farg4, c_farg5, c_farg6, c_farg7
1079     };
1080 #endif // _WIN64
1081 
1082 
1083     uint int_args = 0;
1084     uint fp_args = 0;
1085     uint stk_args = 0; // inc by 2 each time
1086 
1087     for (int i = 0; i < total_args_passed; i++) {
1088       switch (sig_bt[i]) {
1089       case T_BOOLEAN:
1090       case T_CHAR:
1091       case T_BYTE:
1092       case T_SHORT:
1093       case T_INT:
1094         if (int_args < Argument::n_int_register_parameters_c) {
1095           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1096 #ifdef _WIN64
1097           fp_args++;
1098           // Allocate slots for callee to stuff register args the stack.
1099           stk_args += 2;
1100 #endif
1101         } else {
1102           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1103           stk_args += 2;
1104         }
1105         break;
1106       case T_LONG:
1107         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1108         // fall through
1109       case T_OBJECT:
1110       case T_ARRAY:
1111       case T_ADDRESS:
1112       case T_METADATA:
1113         if (int_args < Argument::n_int_register_parameters_c) {
1114           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1115 #ifdef _WIN64
1116           fp_args++;
1117           stk_args += 2;
1118 #endif
1119         } else {
1120           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1121           stk_args += 2;
1122         }
1123         break;
1124       case T_FLOAT:
1125         if (fp_args < Argument::n_float_register_parameters_c) {
1126           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1127 #ifdef _WIN64
1128           int_args++;
1129           // Allocate slots for callee to stuff register args the stack.
1130           stk_args += 2;
1131 #endif
1132         } else {
1133           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1134           stk_args += 2;
1135         }
1136         break;
1137       case T_DOUBLE:
1138         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1139         if (fp_args < Argument::n_float_register_parameters_c) {
1140           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1141 #ifdef _WIN64
1142           int_args++;
1143           // Allocate slots for callee to stuff register args the stack.
1144           stk_args += 2;
1145 #endif
1146         } else {
1147           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1148           stk_args += 2;
1149         }
1150         break;
1151       case T_VOID: // Halves of longs and doubles
1152         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1153         regs[i].set_bad();
1154         break;
1155       default:
1156         ShouldNotReachHere();
1157         break;
1158       }
1159     }
1160 #ifdef _WIN64
1161   // windows abi requires that we always allocate enough stack space
1162   // for 4 64bit registers to be stored down.
1163   if (stk_args < 8) {
1164     stk_args = 8;
1165   }
1166 #endif // _WIN64
1167 
1168   return stk_args;
1169 }
1170 
1171 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1172                                              uint num_bits,
1173                                              uint total_args_passed) {
1174   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1175          "only certain vector sizes are supported for now");
1176 
1177   static const XMMRegister VEC_ArgReg[32] = {
1178      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1179      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1180     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1181     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1182   };
1183 
1184   uint stk_args = 0;
1185   uint fp_args = 0;
1186 
1187   for (uint i = 0; i < total_args_passed; i++) {
1188     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1189     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1190     regs[i].set_pair(vmreg->next(next_val), vmreg);
1191   }
1192 
1193   return stk_args;
1194 }
1195 
1196 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1197   // We always ignore the frame_slots arg and just use the space just below frame pointer
1198   // which by this time is free to use
1199   switch (ret_type) {
1200   case T_FLOAT:
1201     __ movflt(Address(rbp, -wordSize), xmm0);
1202     break;
1203   case T_DOUBLE:
1204     __ movdbl(Address(rbp, -wordSize), xmm0);
1205     break;
1206   case T_VOID:  break;
1207   default: {
1208     __ movptr(Address(rbp, -wordSize), rax);
1209     }
1210   }
1211 }
1212 
1213 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1214   // We always ignore the frame_slots arg and just use the space just below frame pointer
1215   // which by this time is free to use
1216   switch (ret_type) {
1217   case T_FLOAT:
1218     __ movflt(xmm0, Address(rbp, -wordSize));
1219     break;
1220   case T_DOUBLE:
1221     __ movdbl(xmm0, Address(rbp, -wordSize));
1222     break;
1223   case T_VOID:  break;
1224   default: {
1225     __ movptr(rax, Address(rbp, -wordSize));
1226     }
1227   }
1228 }
1229 
1230 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1231     for ( int i = first_arg ; i < arg_count ; i++ ) {
1232       if (args[i].first()->is_Register()) {
1233         __ push(args[i].first()->as_Register());
1234       } else if (args[i].first()->is_XMMRegister()) {
1235         __ subptr(rsp, 2*wordSize);
1236         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1237       }
1238     }
1239 }
1240 
1241 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1242     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1243       if (args[i].first()->is_Register()) {
1244         __ pop(args[i].first()->as_Register());
1245       } else if (args[i].first()->is_XMMRegister()) {
1246         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1247         __ addptr(rsp, 2*wordSize);
1248       }
1249     }
1250 }
1251 
1252 static void verify_oop_args(MacroAssembler* masm,
1253                             const methodHandle& method,
1254                             const BasicType* sig_bt,
1255                             const VMRegPair* regs) {
1256   Register temp_reg = rbx;  // not part of any compiled calling seq
1257   if (VerifyOops) {
1258     for (int i = 0; i < method->size_of_parameters(); i++) {
1259       if (is_reference_type(sig_bt[i])) {
1260         VMReg r = regs[i].first();
1261         assert(r->is_valid(), "bad oop arg");
1262         if (r->is_stack()) {
1263           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1264           __ verify_oop(temp_reg);
1265         } else {
1266           __ verify_oop(r->as_Register());
1267         }
1268       }
1269     }
1270   }
1271 }
1272 
1273 static void check_continuation_enter_argument(VMReg actual_vmreg,
1274                                               Register expected_reg,
1275                                               const char* name) {
1276   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1277   assert(actual_vmreg->as_Register() == expected_reg,
1278          "%s is in unexpected register: %s instead of %s",
1279          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1280 }
1281 
1282 
1283 //---------------------------- continuation_enter_setup ---------------------------
1284 //
1285 // Arguments:
1286 //   None.
1287 //
1288 // Results:
1289 //   rsp: pointer to blank ContinuationEntry
1290 //
1291 // Kills:
1292 //   rax
1293 //
1294 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1295   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1296   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1297   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1298 
1299   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1300   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1301 
1302   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1303   OopMap* map = new OopMap(frame_size, 0);
1304 
1305   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1306   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1307   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1308 
1309   return map;
1310 }
1311 
1312 //---------------------------- fill_continuation_entry ---------------------------
1313 //
1314 // Arguments:
1315 //   rsp: pointer to blank Continuation entry
1316 //   reg_cont_obj: pointer to the continuation
1317 //   reg_flags: flags
1318 //
1319 // Results:
1320 //   rsp: pointer to filled out ContinuationEntry
1321 //
1322 // Kills:
1323 //   rax
1324 //
1325 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1326   assert_different_registers(rax, reg_cont_obj, reg_flags);
1327 #ifdef ASSERT
1328   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1329 #endif
1330   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1331   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1332   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1333   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1334   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1335 
1336   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1337   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1338 
1339   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1340 }
1341 
1342 //---------------------------- continuation_enter_cleanup ---------------------------
1343 //
1344 // Arguments:
1345 //   rsp: pointer to the ContinuationEntry
1346 //
1347 // Results:
1348 //   rsp: pointer to the spilled rbp in the entry frame
1349 //
1350 // Kills:
1351 //   rbx
1352 //
1353 static void continuation_enter_cleanup(MacroAssembler* masm) {
1354 #ifdef ASSERT
1355   Label L_good_sp;
1356   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1357   __ jcc(Assembler::equal, L_good_sp);
1358   __ stop("Incorrect rsp at continuation_enter_cleanup");
1359   __ bind(L_good_sp);
1360 #endif
1361   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1362   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1363   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1364   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1365   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1366 }
1367 
1368 static void gen_continuation_enter(MacroAssembler* masm,
1369                                    const VMRegPair* regs,
1370                                    int& exception_offset,
1371                                    OopMapSet* oop_maps,
1372                                    int& frame_complete,
1373                                    int& stack_slots,
1374                                    int& interpreted_entry_offset,
1375                                    int& compiled_entry_offset) {
1376 
1377   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1378   int pos_cont_obj   = 0;
1379   int pos_is_cont    = 1;
1380   int pos_is_virtual = 2;
1381 
1382   // The platform-specific calling convention may present the arguments in various registers.
1383   // To simplify the rest of the code, we expect the arguments to reside at these known
1384   // registers, and we additionally check the placement here in case calling convention ever
1385   // changes.
1386   Register reg_cont_obj   = c_rarg1;
1387   Register reg_is_cont    = c_rarg2;
1388   Register reg_is_virtual = c_rarg3;
1389 
1390   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1391   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1392   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1393 
1394   // Utility methods kill rax, make sure there are no collisions
1395   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1396 
1397   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1398                          relocInfo::static_call_type);
1399 
1400   address start = __ pc();
1401 
1402   Label L_thaw, L_exit;
1403 
1404   // i2i entry used at interp_only_mode only
1405   interpreted_entry_offset = __ pc() - start;
1406   {
1407 #ifdef ASSERT
1408     Label is_interp_only;
1409     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1410     __ jcc(Assembler::notEqual, is_interp_only);
1411     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1412     __ bind(is_interp_only);
1413 #endif
1414 
1415     __ pop(rax); // return address
1416     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1417     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1418     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1419     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1420     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1421     __ push(rax); // return address
1422     __ push_cont_fastpath();
1423 
1424     __ enter();
1425 
1426     stack_slots = 2; // will be adjusted in setup
1427     OopMap* map = continuation_enter_setup(masm, stack_slots);
1428     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe,
1429     // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway.
1430 
1431     __ verify_oop(reg_cont_obj);
1432 
1433     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1434 
1435     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1436     __ testptr(reg_is_cont, reg_is_cont);
1437     __ jcc(Assembler::notZero, L_thaw);
1438 
1439     // --- Resolve path
1440 
1441     // Make sure the call is patchable
1442     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1443     // Emit stub for static call
1444     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1445     if (stub == nullptr) {
1446       fatal("CodeCache is full at gen_continuation_enter");
1447     }
1448     __ call(resolve);
1449     oop_maps->add_gc_map(__ pc() - start, map);
1450     __ post_call_nop();
1451 
1452     __ jmp(L_exit);
1453   }
1454 
1455   // compiled entry
1456   __ align(CodeEntryAlignment);
1457   compiled_entry_offset = __ pc() - start;
1458   __ enter();
1459 
1460   stack_slots = 2; // will be adjusted in setup
1461   OopMap* map = continuation_enter_setup(masm, stack_slots);
1462 
1463   // Frame is now completed as far as size and linkage.
1464   frame_complete = __ pc() - start;
1465 
1466   __ verify_oop(reg_cont_obj);
1467 
1468   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1469 
1470   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1471   __ testptr(reg_is_cont, reg_is_cont);
1472   __ jccb(Assembler::notZero, L_thaw);
1473 
1474   // --- call Continuation.enter(Continuation c, boolean isContinue)
1475 
1476   // Make sure the call is patchable
1477   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1478 
1479   // Emit stub for static call
1480   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1481   if (stub == nullptr) {
1482     fatal("CodeCache is full at gen_continuation_enter");
1483   }
1484 
1485   // The call needs to be resolved. There's a special case for this in
1486   // SharedRuntime::find_callee_info_helper() which calls
1487   // LinkResolver::resolve_continuation_enter() which resolves the call to
1488   // Continuation.enter(Continuation c, boolean isContinue).
1489   __ call(resolve);
1490 
1491   oop_maps->add_gc_map(__ pc() - start, map);
1492   __ post_call_nop();
1493 
1494   __ jmpb(L_exit);
1495 
1496   // --- Thawing path
1497 
1498   __ bind(L_thaw);
1499 
1500   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1501   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1502 
1503   ContinuationEntry::_return_pc_offset = __ pc() - start;
1504   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1505   __ post_call_nop();
1506 
1507   // --- Normal exit (resolve/thawing)
1508 
1509   __ bind(L_exit);
1510   ContinuationEntry::_cleanup_offset = __ pc() - start;
1511   continuation_enter_cleanup(masm);
1512   __ pop(rbp);
1513   __ ret(0);
1514 
1515   // --- Exception handling path
1516 
1517   exception_offset = __ pc() - start;
1518 
1519   continuation_enter_cleanup(masm);
1520   __ pop(rbp);
1521 
1522   __ movptr(c_rarg0, r15_thread);
1523   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1524 
1525   // rax still holds the original exception oop, save it before the call
1526   __ push(rax);
1527 
1528   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1529   __ movptr(rbx, rax);
1530 
1531   // Continue at exception handler:
1532   //   rax: exception oop
1533   //   rbx: exception handler
1534   //   rdx: exception pc
1535   __ pop(rax);
1536   __ verify_oop(rax);
1537   __ pop(rdx);
1538   __ jmp(rbx);
1539 }
1540 
1541 static void gen_continuation_yield(MacroAssembler* masm,
1542                                    const VMRegPair* regs,
1543                                    OopMapSet* oop_maps,
1544                                    int& frame_complete,
1545                                    int& stack_slots,
1546                                    int& compiled_entry_offset) {
       // Emits the stub used for the continuation yield intrinsic: it builds a
       // minimal frame, calls Continuation::freeze_entry() and then either
       // returns through the continuation entry or back to the caller.
       //
       //   masm                  - assembler the code is emitted into
       //   regs                  - not referenced in this function
       //   oop_maps              - receives one OopMap, registered at frame_complete
       //   frame_complete        - out: offset of the pc immediately after enter()
       //   stack_slots           - out: framesize / VMRegImpl::slots_per_word
       //                           (asserted to be 2)
       //   compiled_entry_offset - out: offset of the entry point from the start
       //                           of this stub; nothing is emitted before it, so 0

       // Slot indices of the frame: two slots named for the saved rbp and two
       // for the return address.
1547   enum layout {
1548     rbp_off,
1549     rbpH_off,
1550     return_off,
1551     return_off2,
1552     framesize // inclusive of return address
1553   };
1554   stack_slots = framesize /  VMRegImpl::slots_per_word;
1555   assert(stack_slots == 2, "recheck layout");
1556 
1557   address start = __ pc();
1558   compiled_entry_offset = __ pc() - start;
1559   __ enter();
       // the_pc is both the frame-complete point and the pc recorded as the
       // last Java pc below.
1560   address the_pc = __ pc();
1561 
1562   frame_complete = the_pc - start;
1563 
1564   // This nop must be exactly at the PC we push into the frame info.
1565   // We use this nop for fast CodeBlob lookup, associate the OopMap
1566   // with it right away.
1567   __ post_call_nop();
1568   OopMap* map = new OopMap(framesize, 1);
1569   oop_maps->add_gc_map(frame_complete, map);
1570 
       // Publish rsp/rbp/the_pc as the last Java frame, then call the freeze
       // entry as a leaf call with (current thread, current rsp).
1571   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1572   __ movptr(c_rarg0, r15_thread);
1573   __ movptr(c_rarg1, rsp);
1574   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1575   __ reset_last_Java_frame(true);
1576 
1577   Label L_pinned;
1578 
       // A non-zero value in rax after the call takes the pinned path.
       // NOTE(review): rax is presumably the result returned by freeze_entry - confirm.
1579   __ testptr(rax, rax);
1580   __ jcc(Assembler::notZero, L_pinned);
1581 
       // rax == 0: reload rsp from the thread's cont_entry field, run the
       // continuation-entry cleanup, pop rbp and return from that frame.
1582   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1583   continuation_enter_cleanup(masm);
1584   __ pop(rbp);
1585   __ ret(0);
1586 
1587   __ bind(L_pinned);
1588 
1589   // Pinned, return to caller
1590 
1591   // handle pending exception thrown by freeze
1592   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1593   Label ok;
1594   __ jcc(Assembler::equal, ok);
       // Pending exception is set: tear down this frame and jump to the
       // forward-exception stub instead of returning normally.
1595   __ leave();
1596   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1597   __ bind(ok);
1598 
       // No pending exception: tear down this frame and return to the caller.
1599   __ leave();
1600   __ ret(0);
1601 }
1602 
1603 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
       // Forwards to the global-scope function of the same name. The leading
       // '::' is required: an unqualified call here would resolve to this
       // member function and recurse.
1604   ::continuation_enter_cleanup(masm);
1605 }
1606 
1607 static void gen_special_dispatch(MacroAssembler* masm,
1608                                  const methodHandle& method,
1609                                  const BasicType* sig_bt,
1610                                  const VMRegPair* regs) {
       // Emits the dispatch code for a method handle intrinsic. It works out
       // which registers hold the receiver and the trailing member argument
       // (whichever the intrinsic id calls for) and hands them to
       // MethodHandles::generate_method_handle_dispatch() with
       // for_compiler_entry set to true.
       //
       //   masm   - assembler the code is emitted into
       //   method - the intrinsic method; supplies the intrinsic id and the
       //            parameter count
       //   sig_bt - basic types of the incoming arguments
       //   regs   - locations (register or stack) of the incoming arguments
1611   verify_oop_args(masm, method, sig_bt, regs);
1612   vmIntrinsics::ID iid = method->intrinsic_id();
1613 
1614   // Now write the args into the outgoing interpreter space
1615   bool     has_receiver   = false;
1616   Register receiver_reg   = noreg;
1617   int      member_arg_pos = -1;
1618   Register member_reg     = noreg;
1619   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
       // Three accepted shapes: a non-zero ref_kind (trailing MemberName,
       // receiver depends on the ref kind), _invokeBasic (receiver only), and
       // _linkToNative (trailing NativeEntryPoint, no receiver). Anything else
       // is a fatal error.
1620   if (ref_kind != 0) {
1621     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1622     member_reg = rbx;  // known to be free at this point
1623     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1624   } else if (iid == vmIntrinsics::_invokeBasic) {
1625     has_receiver = true;
1626   } else if (iid == vmIntrinsics::_linkToNative) {
1627     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1628     member_reg = rbx;  // known to be free at this point
1629   } else {
1630     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1631   }
1632 
1633   if (member_reg != noreg) {
1634     // Load the member_arg into register, if necessary.
1635     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1636     VMReg r = regs[member_arg_pos].first();
1637     if (r->is_stack()) {
           // NOTE(review): the extra wordSize presumably skips the return
           // address sitting at rsp on entry - confirm.
1638       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1639     } else {
1640       // no data motion is needed
1641       member_reg = r->as_Register();
1642     }
1643   }
1644 
1645   if (has_receiver) {
1646     // Make sure the receiver is loaded into a register.
1647     assert(method->size_of_parameters() > 0, "oob");
1648     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1649     VMReg r = regs[0].first();
1650     assert(r->is_valid(), "bad receiver arg");
1651     if (r->is_stack()) {
1652       // Porting note:  This assumes that compiled calling conventions always
1653       // pass the receiver oop in a register.  If this is not true on some
1654       // platform, pick a temp and load the receiver from stack.
           // NOTE(review): fatal() presumably does not return, so the two
           // statements after it would only serve as the template the porting
           // note describes - confirm.
1655       fatal("receiver always in a register");
1656       receiver_reg = j_rarg0;  // known to be free at this point
1657       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1658     } else {
1659       // no data motion is needed
1660       receiver_reg = r->as_Register();
1661     }
1662   }
1663 
1664   // Figure out which address we are really jumping to:
       // receiver_reg / member_reg stay noreg when the intrinsic does not use them.
1665   MethodHandles::generate_method_handle_dispatch(masm, iid,
1666                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1667 }
1668 
1669 // ---------------------------------------------------------------------------
1670 // Generate a native wrapper for a given method.  The method takes arguments
1671 // in the Java compiled code convention, marshals them to the native
1672 // convention (handlizes oops, etc), transitions to native, makes the call,
1673 // returns to java state (possibly blocking), unhandlizes any result and
1674 // returns.
1675 //
1676 // Critical native functions are a shorthand for the use of
1677 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1678 // functions.  The wrapper is expected to unpack the arguments before
1679 // passing them to the callee. Critical native functions leave the state _in_Java,
1680 // since they cannot stop for GC.
1681 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1682 // block and the check for pending exceptions, because it's impossible for them
1683 // to be thrown.
1684 //
1685 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1686                                                 const methodHandle& method,
1687                                                 int compile_id,
1688                                                 BasicType* in_sig_bt,
1689                                                 VMRegPair* in_regs,
1690                                                 BasicType ret_type) {
1691   if (method->is_continuation_native_intrinsic()) {
1692     int exception_offset = -1;
1693     OopMapSet* oop_maps = new OopMapSet();
1694     int frame_complete = -1;
1695     int stack_slots = -1;
1696     int interpreted_entry_offset = -1;
1697     int vep_offset = -1;
1698     if (method->is_continuation_enter_intrinsic()) {
1699       gen_continuation_enter(masm,
1700                              in_regs,
1701                              exception_offset,
1702                              oop_maps,
1703                              frame_complete,
1704                              stack_slots,
1705                              interpreted_entry_offset,
1706                              vep_offset);
1707     } else if (method->is_continuation_yield_intrinsic()) {
1708       gen_continuation_yield(masm,
1709                              in_regs,
1710                              oop_maps,
1711                              frame_complete,
1712                              stack_slots,
1713                              vep_offset);
1714     } else {
1715       guarantee(false, "Unknown Continuation native intrinsic");
1716     }
1717 
1718 #ifdef ASSERT
1719     if (method->is_continuation_enter_intrinsic()) {
1720       assert(interpreted_entry_offset != -1, "Must be set");
1721       assert(exception_offset != -1,         "Must be set");
1722     } else {
1723       assert(interpreted_entry_offset == -1, "Must be unset");
1724       assert(exception_offset == -1,         "Must be unset");
1725     }
1726     assert(frame_complete != -1,    "Must be set");
1727     assert(stack_slots != -1,       "Must be set");
1728     assert(vep_offset != -1,        "Must be set");
1729 #endif
1730 
1731     __ flush();
1732     nmethod* nm = nmethod::new_native_nmethod(method,
1733                                               compile_id,
1734                                               masm->code(),
1735                                               vep_offset,
1736                                               frame_complete,
1737                                               stack_slots,
1738                                               in_ByteSize(-1),
1739                                               in_ByteSize(-1),
1740                                               oop_maps,
1741                                               exception_offset);
1742     if (nm == nullptr) return nm;
1743     if (method->is_continuation_enter_intrinsic()) {
1744       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1745     } else if (method->is_continuation_yield_intrinsic()) {
1746       _cont_doYield_stub = nm;
1747     }
1748     return nm;
1749   }
1750 
1751   if (method->is_method_handle_intrinsic()) {
1752     vmIntrinsics::ID iid = method->intrinsic_id();
1753     intptr_t start = (intptr_t)__ pc();
1754     int vep_offset = ((intptr_t)__ pc()) - start;
1755     gen_special_dispatch(masm,
1756                          method,
1757                          in_sig_bt,
1758                          in_regs);
1759     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1760     __ flush();
1761     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1762     return nmethod::new_native_nmethod(method,
1763                                        compile_id,
1764                                        masm->code(),
1765                                        vep_offset,
1766                                        frame_complete,
1767                                        stack_slots / VMRegImpl::slots_per_word,
1768                                        in_ByteSize(-1),
1769                                        in_ByteSize(-1),
1770                                        nullptr);
1771   }
1772   address native_func = method->native_function();
1773   assert(native_func != nullptr, "must have function");
1774 
1775   // An OopMap for lock (and class if static)
1776   OopMapSet *oop_maps = new OopMapSet();
1777   intptr_t start = (intptr_t)__ pc();
1778 
1779   // We have received a description of where all the java arg are located
1780   // on entry to the wrapper. We need to convert these args to where
1781   // the jni function will expect them. To figure out where they go
1782   // we convert the java signature to a C signature by inserting
1783   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1784 
1785   const int total_in_args = method->size_of_parameters();
1786   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1787 
1788   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1789   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1790 
1791   int argc = 0;
1792   out_sig_bt[argc++] = T_ADDRESS;
1793   if (method->is_static()) {
1794     out_sig_bt[argc++] = T_OBJECT;
1795   }
1796 
1797   for (int i = 0; i < total_in_args ; i++ ) {
1798     out_sig_bt[argc++] = in_sig_bt[i];
1799   }
1800 
1801   // Now figure out where the args must be stored and how much stack space
1802   // they require.
1803   int out_arg_slots;
1804   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1805 
1806   // Compute framesize for the wrapper.  We need to handlize all oops in
1807   // incoming registers
1808 
1809   // Calculate the total number of stack slots we will need.
1810 
1811   // First count the abi requirement plus all of the outgoing args
1812   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1813 
1814   // Now the space for the inbound oop handle area
1815   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1816 
1817   int oop_handle_offset = stack_slots;
1818   stack_slots += total_save_slots;
1819 
1820   // Now any space we need for handlizing a klass if static method
1821 
1822   int klass_slot_offset = 0;
1823   int klass_offset = -1;
1824   int lock_slot_offset = 0;
1825   bool is_static = false;
1826 
1827   if (method->is_static()) {
1828     klass_slot_offset = stack_slots;
1829     stack_slots += VMRegImpl::slots_per_word;
1830     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1831     is_static = true;
1832   }
1833 
1834   // Plus a lock if needed
1835 
1836   if (method->is_synchronized()) {
1837     lock_slot_offset = stack_slots;
1838     stack_slots += VMRegImpl::slots_per_word;
1839   }
1840 
1841   // Now a place (+2) to save return values or temp during shuffling
1842   // + 4 for return address (which we own) and saved rbp
1843   stack_slots += 6;
1844 
1845   // Ok The space we have allocated will look like:
1846   //
1847   //
1848   // FP-> |                     |
1849   //      |---------------------|
1850   //      | 2 slots for moves   |
1851   //      |---------------------|
1852   //      | lock box (if sync)  |
1853   //      |---------------------| <- lock_slot_offset
1854   //      | klass (if static)   |
1855   //      |---------------------| <- klass_slot_offset
1856   //      | oopHandle area      |
1857   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1858   //      | outbound memory     |
1859   //      | based arguments     |
1860   //      |                     |
1861   //      |---------------------|
1862   //      |                     |
1863   // SP-> | out_preserved_slots |
1864   //
1865   //
1866 
1867 
1868   // Now compute actual number of stack words we need rounding to make
1869   // stack properly aligned.
1870   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1871 
1872   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1873 
1874   // First thing make an ic check to see if we should even be here
1875 
1876   // We are free to use all registers as temps without saving them and
1877   // restoring them except rbp. rbp is the only callee save register
1878   // as far as the interpreter and the compiler(s) are concerned.
1879 
1880   const Register receiver = j_rarg0;
1881 
1882   Label exception_pending;
1883 
1884   assert_different_registers(receiver, rscratch1, rscratch2);
1885   __ verify_oop(receiver);
1886   __ ic_check(8 /* end_alignment */);
1887 
1888   int vep_offset = ((intptr_t)__ pc()) - start;
1889 
1890   if (method->needs_clinit_barrier()) {
1891     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
1892     Label L_skip_barrier;
1893     Register klass = r10;
1894     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1895     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1896 
1897     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1898 
1899     __ bind(L_skip_barrier);
1900   }
1901 
1902 #ifdef COMPILER1
1903   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1904   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1905     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1906   }
1907 #endif // COMPILER1
1908 
1909   // The instruction at the verified entry point must be 5 bytes or longer
1910   // because it can be patched on the fly by make_non_entrant. The stack bang
1911   // instruction fits that requirement.
1912 
1913   // Generate stack overflow check
1914   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1915 
1916   // Generate a new frame for the wrapper.
1917   __ enter();
1918   // -2 because return address is already present and so is saved rbp
1919   __ subptr(rsp, stack_size - 2*wordSize);
1920 
1921   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1922   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1923   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1924 
1925   // Frame is now completed as far as size and linkage.
1926   int frame_complete = ((intptr_t)__ pc()) - start;
1927 
1928 #ifdef ASSERT
1929   __ check_stack_alignment(rsp, "improperly aligned stack");
1930 #endif /* ASSERT */
1931 
1932 
1933   // We use r14 as the oop handle for the receiver/klass
1934   // It is callee save so it survives the call to native
1935 
1936   const Register oop_handle_reg = r14;
1937 
1938   //
1939   // We immediately shuffle the arguments so that any vm call we have to
1940   // make from here on out (sync slow path, jvmti, etc.) we will have
1941   // captured the oops from our caller and have a valid oopMap for
1942   // them.
1943 
1944   // -----------------
1945   // The Grand Shuffle
1946 
1947   // The Java calling convention is either equal (linux) or denser (win64) than the
1948   // c calling convention. However, because of the jni_env argument, the c calling
1949   // convention always has at least one more (and two for static) arguments than Java.
1950   // Therefore if we move the args from java -> c backwards then we will never have
1951   // a register->register conflict and we don't have to build a dependency graph
1952   // and figure out how to break any cycles.
1953   //
1954 
1955   // Record esp-based slot for receiver on stack for non-static methods
1956   int receiver_offset = -1;
1957 
1958   // This is a trick. We double the stack slots so we can claim
1959   // the oops in the caller's frame. Since we are sure to have
1960   // more args than the caller doubling is enough to make
1961   // sure we can capture all the incoming oop args from the
1962   // caller.
1963   //
1964   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1965 
1966   // Mark location of rbp (someday)
1967   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1968 
1969   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1970   // All inbound args are referenced based on rbp and all outbound args via rsp.
1971 
1972 
1973 #ifdef ASSERT
1974   bool reg_destroyed[Register::number_of_registers];
1975   bool freg_destroyed[XMMRegister::number_of_registers];
1976   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
1977     reg_destroyed[r] = false;
1978   }
1979   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
1980     freg_destroyed[f] = false;
1981   }
1982 
1983 #endif /* ASSERT */
1984 
1985   // For JNI natives the incoming and outgoing registers are offset upwards.
1986   GrowableArray<int> arg_order(2 * total_in_args);
1987 
1988   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1989     arg_order.push(i);
1990     arg_order.push(c_arg);
1991   }
1992 
1993   for (int ai = 0; ai < arg_order.length(); ai += 2) {
1994     int i = arg_order.at(ai);
1995     int c_arg = arg_order.at(ai + 1);
1996     __ block_comment(err_msg("move %d -> %d", i, c_arg));
1997 #ifdef ASSERT
1998     if (in_regs[i].first()->is_Register()) {
1999       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2000     } else if (in_regs[i].first()->is_XMMRegister()) {
2001       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2002     }
2003     if (out_regs[c_arg].first()->is_Register()) {
2004       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2005     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2006       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2007     }
2008 #endif /* ASSERT */
2009     switch (in_sig_bt[i]) {
2010       case T_ARRAY:
2011       case T_OBJECT:
2012         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2013                     ((i == 0) && (!is_static)),
2014                     &receiver_offset);
2015         break;
2016       case T_VOID:
2017         break;
2018 
2019       case T_FLOAT:
2020         __ float_move(in_regs[i], out_regs[c_arg]);
2021           break;
2022 
2023       case T_DOUBLE:
2024         assert( i + 1 < total_in_args &&
2025                 in_sig_bt[i + 1] == T_VOID &&
2026                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2027         __ double_move(in_regs[i], out_regs[c_arg]);
2028         break;
2029 
2030       case T_LONG :
2031         __ long_move(in_regs[i], out_regs[c_arg]);
2032         break;
2033 
2034       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2035 
2036       default:
2037         __ move32_64(in_regs[i], out_regs[c_arg]);
2038     }
2039   }
2040 
2041   int c_arg;
2042 
2043   // Pre-load a static method's oop into r14.  Used both by locking code and
2044   // the normal JNI call code.
2045   // point c_arg at the first arg that is already loaded in case we
2046   // need to spill before we call out
2047   c_arg = total_c_args - total_in_args;
2048 
2049   if (method->is_static()) {
2050 
2051     //  load oop into a register
2052     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2053 
2054     // Now handlize the static class mirror; it's known to be non-null.
2055     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2056     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2057 
2058     // Now get the handle
2059     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2060     // store the klass handle as second argument
2061     __ movptr(c_rarg1, oop_handle_reg);
2062     // and protect the arg if we must spill
2063     c_arg--;
2064   }
2065 
2066   // Change state to native (we save the return address in the thread, since it might not
2067   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2068   // points into the right code segment. It does not have to be the correct return pc.
2069   // We use the same pc/oopMap repeatedly when we call out
2070 
2071   Label native_return;
2072   if (method->is_object_wait0()) {
2073     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2074     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2075   } else {
2076     intptr_t the_pc = (intptr_t) __ pc();
2077     oop_maps->add_gc_map(the_pc - start, map);
2078 
2079     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2080   }
2081 
2082   // We have all of the arguments set up at this point. We must not touch any
2083   // argument registers at this point (what if we save/restore them and there are no oops?)
2084 
2085   if (DTraceMethodProbes) {
2086     // protect the args we've loaded
2087     save_args(masm, total_c_args, c_arg, out_regs);
2088     __ mov_metadata(c_rarg1, method());
2089     __ call_VM_leaf(
2090       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2091       r15_thread, c_rarg1);
2092     restore_args(masm, total_c_args, c_arg, out_regs);
2093   }
2094 
2095   // RedefineClasses() tracing support for obsolete method entry
2096   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2097     // protect the args we've loaded
2098     save_args(masm, total_c_args, c_arg, out_regs);
2099     __ mov_metadata(c_rarg1, method());
2100     __ call_VM_leaf(
2101       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2102       r15_thread, c_rarg1);
2103     restore_args(masm, total_c_args, c_arg, out_regs);
2104   }
2105 
2106   // Lock a synchronized method
2107 
2108   // Register definitions used by locking and unlocking
2109 
2110   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2111   const Register obj_reg  = rbx;  // Will contain the oop
2112   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2113 
2114   Label slow_path_lock;
2115   Label lock_done;
2116 
2117   if (method->is_synchronized()) {
2118     // Get the handle (the 2nd argument)
2119     __ mov(oop_handle_reg, c_rarg1);
2120 
2121     // Get address of the box
2122 
2123     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2124 
2125     // Load the oop from the handle
2126     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2127 
2128     __ fast_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2129 
2130     // Slow path will re-enter here
2131     __ bind(lock_done);
2132   }
2133 
2134   // Finally just about ready to make the JNI call
2135 
2136   // get JNIEnv* which is first argument to native
2137   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2138 
2139   // Now set thread in native
2140   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2141 
2142   __ call(RuntimeAddress(native_func));
2143 
2144   // Verify or restore cpu control state after JNI call
2145   __ restore_cpu_control_state_after_jni(rscratch1);
2146 
2147   // Unpack native results.
2148   switch (ret_type) {
2149   case T_BOOLEAN: __ c2bool(rax);            break;
2150   case T_CHAR   : __ movzwl(rax, rax);      break;
2151   case T_BYTE   : __ sign_extend_byte (rax); break;
2152   case T_SHORT  : __ sign_extend_short(rax); break;
2153   case T_INT    : /* nothing to do */        break;
2154   case T_DOUBLE :
2155   case T_FLOAT  :
2156     // Result is in xmm0 we'll save as needed
2157     break;
2158   case T_ARRAY:                 // Really a handle
2159   case T_OBJECT:                // Really a handle
2160       break; // can't de-handlize until after safepoint check
2161   case T_VOID: break;
2162   case T_LONG: break;
2163   default       : ShouldNotReachHere();
2164   }
2165 
2166   // Switch thread to "native transition" state before reading the synchronization state.
2167   // This additional state is necessary because reading and testing the synchronization
2168   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2169   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2170   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2171   //     Thread A is resumed to finish this native method, but doesn't block here since it
2172   //     didn't see any synchronization in progress, and escapes.
2173   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2174 
2175   // Force this write out before the read below
2176   if (!UseSystemMemoryBarrier) {
2177     __ membar(Assembler::Membar_mask_bits(
2178               Assembler::LoadLoad | Assembler::LoadStore |
2179               Assembler::StoreLoad | Assembler::StoreStore));
2180   }
2181 
2182   // check for safepoint operation in progress and/or pending suspend requests
2183   {
2184     Label Continue;
2185     Label slow_path;
2186 
2187     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2188 
2189     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2190     __ jcc(Assembler::equal, Continue);
2191     __ bind(slow_path);
2192 
2193     // Don't use call_VM as it will see a possible pending exception and forward it
2194     // and never return here preventing us from clearing _last_native_pc down below.
2195     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2196     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2197     // by hand.
2198     //
2199     __ vzeroupper();
2200     save_native_result(masm, ret_type, stack_slots);
2201     __ mov(c_rarg0, r15_thread);
2202     __ mov(r12, rsp); // remember sp
2203     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2204     __ andptr(rsp, -16); // align stack as required by ABI
2205     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2206     __ mov(rsp, r12); // restore sp
2207     __ reinit_heapbase();
2208     // Restore any method result value
2209     restore_native_result(masm, ret_type, stack_slots);
2210     __ bind(Continue);
2211   }
2212 
2213   // change thread state
2214   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2215 
2216   if (method->is_object_wait0()) {
2217     // Check preemption for Object.wait()
2218     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2219     __ cmpptr(rscratch1, NULL_WORD);
2220     __ jccb(Assembler::equal, native_return);
2221     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2222     __ jmp(rscratch1);
2223     __ bind(native_return);
2224 
2225     intptr_t the_pc = (intptr_t) __ pc();
2226     oop_maps->add_gc_map(the_pc - start, map);
2227   }
2228 
2229 
2230   Label reguard;
2231   Label reguard_done;
2232   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2233   __ jcc(Assembler::equal, reguard);
2234   __ bind(reguard_done);
2235 
2236   // native result if any is live
2237 
2238   // Unlock
2239   Label slow_path_unlock;
2240   Label unlock_done;
2241   if (method->is_synchronized()) {
2242 
2243     Label fast_done;
2244 
2245     // Get locked oop from the handle we passed to jni
2246     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2247 
2248     // Must save rax if it is live now because cmpxchg must use it
2249     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2250       save_native_result(masm, ret_type, stack_slots);
2251     }
2252 
2253     __ fast_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2254 
2255     // slow path re-enters here
2256     __ bind(unlock_done);
2257     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2258       restore_native_result(masm, ret_type, stack_slots);
2259     }
2260 
2261     __ bind(fast_done);
2262   }
2263   if (DTraceMethodProbes) {
2264     save_native_result(masm, ret_type, stack_slots);
2265     __ mov_metadata(c_rarg1, method());
2266     __ call_VM_leaf(
2267          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2268          r15_thread, c_rarg1);
2269     restore_native_result(masm, ret_type, stack_slots);
2270   }
2271 
2272   __ reset_last_Java_frame(false);
2273 
2274   // Unbox oop result, e.g. JNIHandles::resolve value.
2275   if (is_reference_type(ret_type)) {
2276     __ resolve_jobject(rax /* value */,
2277                        rcx /* tmp */);
2278   }
2279 
2280   if (CheckJNICalls) {
2281     // clear_pending_jni_exception_check
2282     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2283   }
2284 
2285   // reset handle block
2286   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2287   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2288 
2289   // pop our frame
2290 
2291   __ leave();
2292 
2293 #if INCLUDE_JFR
2294   // We need to do a poll test after unwind in case the sampler
2295   // managed to sample the native frame after returning to Java.
2296   Label L_return;
2297   address poll_test_pc = __ pc();
2298   __ relocate(relocInfo::poll_return_type);
2299   __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2300   __ jccb(Assembler::zero, L_return);
2301   __ lea(rscratch1, InternalAddress(poll_test_pc));
2302   __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2303   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2304     "polling page return stub not created yet");
2305   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2306   __ jump(RuntimeAddress(stub));
2307   __ bind(L_return);
2308 #endif // INCLUDE_JFR
2309 
2310   // Any exception pending?
2311   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2312   __ jcc(Assembler::notEqual, exception_pending);
2313 
2314   // Return
2315 
2316   __ ret(0);
2317 
2318   // Unexpected paths are out of line and go here
2319 
2320   // forward the exception
2321   __ bind(exception_pending);
2322 
2323   // and forward the exception
2324   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2325 
2326   // Slow path locking & unlocking
2327   if (method->is_synchronized()) {
2328 
2329     // BEGIN Slow path lock
2330     __ bind(slow_path_lock);
2331 
2332     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2333     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2334 
2335     // protect the args we've loaded
2336     save_args(masm, total_c_args, c_arg, out_regs);
2337 
2338     __ mov(c_rarg0, obj_reg);
2339     __ mov(c_rarg1, lock_reg);
2340     __ mov(c_rarg2, r15_thread);
2341 
2342     // Not a leaf but we have last_Java_frame setup as we want.
2343     // We don't want to unmount in case of contention since that would complicate preserving
2344     // the arguments that had already been marshalled into the native convention. So we force
2345     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2346     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2347     __ push_cont_fastpath();
2348     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2349     __ pop_cont_fastpath();
2350     restore_args(masm, total_c_args, c_arg, out_regs);
2351 
2352 #ifdef ASSERT
2353     { Label L;
2354     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2355     __ jcc(Assembler::equal, L);
2356     __ stop("no pending exception allowed on exit from monitorenter");
2357     __ bind(L);
2358     }
2359 #endif
2360     __ jmp(lock_done);
2361 
2362     // END Slow path lock
2363 
2364     // BEGIN Slow path unlock
2365     __ bind(slow_path_unlock);
2366 
2367     // If we haven't already saved the native result we must save it now as xmm registers
2368     // are still exposed.
2369     __ vzeroupper();
2370     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2371       save_native_result(masm, ret_type, stack_slots);
2372     }
2373 
2374     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2375 
2376     __ mov(c_rarg0, obj_reg);
2377     __ mov(c_rarg2, r15_thread);
2378     __ mov(r12, rsp); // remember sp
2379     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2380     __ andptr(rsp, -16); // align stack as required by ABI
2381 
2382     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2383     // NOTE that obj_reg == rbx currently
2384     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2385     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2386 
2387     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2388     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2389     __ mov(rsp, r12); // restore sp
2390     __ reinit_heapbase();
2391 #ifdef ASSERT
2392     {
2393       Label L;
2394       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2395       __ jcc(Assembler::equal, L);
2396       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2397       __ bind(L);
2398     }
2399 #endif /* ASSERT */
2400 
2401     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2402 
2403     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2404       restore_native_result(masm, ret_type, stack_slots);
2405     }
2406     __ jmp(unlock_done);
2407 
2408     // END Slow path unlock
2409 
2410   } // synchronized
2411 
2412   // SLOW PATH Reguard the stack if needed
2413 
2414   __ bind(reguard);
2415   __ vzeroupper();
2416   save_native_result(masm, ret_type, stack_slots);
2417   __ mov(r12, rsp); // remember sp
2418   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2419   __ andptr(rsp, -16); // align stack as required by ABI
2420   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2421   __ mov(rsp, r12); // restore sp
2422   __ reinit_heapbase();
2423   restore_native_result(masm, ret_type, stack_slots);
2424   // and continue
2425   __ jmp(reguard_done);
2426 
2427 
2428 
2429   __ flush();
2430 
2431   nmethod *nm = nmethod::new_native_nmethod(method,
2432                                             compile_id,
2433                                             masm->code(),
2434                                             vep_offset,
2435                                             frame_complete,
2436                                             stack_slots / VMRegImpl::slots_per_word,
2437                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2438                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2439                                             oop_maps);
2440 
2441   return nm;
2442 }
2443 
2444 // Returns the size adjustment (in number of words) to apply to a c2i adapter
2445 // activation, for use during deoptimization.
     // The value is the difference (callee_locals - callee_parameters) scaled by
     // Interpreter::stackElementWords.
2446 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2447   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2448 }
2449 
2450 // Always returns 0 on this platform.
     // NOTE(review): by analogy with in_preserve_stack_slots() this presumably
     // counts slots preserved in the outgoing argument area - confirm against
     // the shared declaration.
2451 uint SharedRuntime::out_preserve_stack_slots() {
2452   return 0;
2453 }
2454 
2455 
2456 // Number of stack slots between incoming argument block and the start of
2457 // a new frame.  The PROLOG must add this many slots to the stack.  The
2458 // EPILOG must remove this many slots.  amd64 needs two slots for
2459 // return address.
     //
     // The value returned is 4 slots, plus 2 more when VerifyStackAtCalls is set.
     // NOTE(review): the text above only accounts for the two return-address
     // slots; the other two are presumably for the saved rbp - confirm.
2460 uint SharedRuntime::in_preserve_stack_slots() {
2461   return 4 + 2 * VerifyStackAtCalls;
2462 }
2463 // Returns r15_thread as a VMReg. r15_thread is the register the generated
     // code in this file uses as the base for JavaThread field accesses.
2464 VMReg SharedRuntime::thread_register() {
2465   return r15_thread->as_VMReg();
2466 }
2467 
2468 //------------------------------generate_deopt_blob----------------------------
     // Builds the deoptimization blob and installs it in _deopt_blob.
     //
     // If AOTCodeCache::load_code_blob() yields a blob, that blob is installed and
     // no code is generated. Otherwise the code is assembled into a local
     // CodeBuffer, wrapped with DeoptimizationBlob::create(), and handed to
     // AOTCodeCache::store_code_blob().
     //
     // The generated code has four entry points that all converge on 'cont':
     //   - start of the blob:        normal deopt      (r14 = Unpack_deopt)
     //   - reexecute_offset:         reexecute         (r14 = Unpack_reexecute)
     //   - exception_offset:         exception oop/pc arrive in rax/rdx, are
     //                               stored into the JavaThread, then control
     //                               falls through to the next entry
     //   - exception_in_tls_offset:  exception oop/pc already in the JavaThread
     //                               (r14 = Unpack_exception)
     // From 'cont' the code calls Deoptimization::fetch_unroll_info(), pops the
     // deoptimized frame, pushes the replacement interpreter frames described by
     // the returned UnrollBlock, re-pushes its own frame, calls
     // Deoptimization::unpack_frames() and finally returns into the interpreter.
2469 void SharedRuntime::generate_deopt_blob() {
2470   // Allocate space for the code
2471   ResourceMark rm;
2472   // Setup code generation tools
       // 'pad' is extra code-buffer space, added when UseAVX > 2 and/or UseAPX.
       // NOTE(review): presumably the register save/restore sequences are longer
       // with those features enabled - confirm.
2473   int pad = 0;
2474   if (UseAVX > 2) {
2475     pad += 1024;
2476   }
2477   if (UseAPX) {
2478     pad += 1024;
2479   }
2480   const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
       // Reuse a blob from the AOT code cache when one is available.
2481   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2482   if (blob != nullptr) {
2483     _deopt_blob = blob->as_deoptimization_blob();
2484     return;
2485   }
2486
2487   CodeBuffer buffer(name, 2560+pad, 1024);
2488   MacroAssembler* masm = new MacroAssembler(&buffer);
2489   int frame_size_in_words;
2490   OopMap* map = nullptr;
2491   OopMapSet *oop_maps = new OopMapSet();
2492
2493   // -------------
2494   // This code enters when returning to a de-optimized nmethod.  A return
2495   // address has been pushed on the stack, and return values are in
2496   // registers.
2497   // If we are doing a normal deopt then we were called from the patched
2498   // nmethod from the point we returned to the nmethod. So the return
2499   // address on the stack is wrong by NativeCall::instruction_size
2500   // We will adjust the value so it looks like we have the original return
2501   // address on the stack (like when we eagerly deoptimized).
2502   // In the case of an exception pending when deoptimizing, we enter
2503   // with a return address on the stack that points after the call we patched
2504   // into the exception handler. We have the following register state from,
2505   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2506   //    rax: exception oop
2507   //    rbx: exception handler
2508   //    rdx: throwing pc
2509   // So in this case we simply jam rdx into the useless return address and
2510   // the stack looks just like we want.
2511   //
2512   // At this point we need to de-opt.  We save the argument return
2513   // registers.  We call the first C routine, fetch_unroll_info().  This
2514   // routine captures the return values and returns a structure which
2515   // describes the current frame size and the sizes of all replacement frames.
2516   // The current frame is compiled code and may contain many inlined
2517   // functions, each with their own JVM state.  We pop the current frame, then
2518   // push all the new frames.  Then we call the C routine unpack_frames() to
2519   // populate these frames.  Finally unpack_frames() returns us the new target
2520   // address.  Notice that callee-save registers are BLOWN here; they have
2521   // already been captured in the vframeArray at the time the return PC was
2522   // patched.
2523   address start = __ pc();
2524   Label cont;
2525
2526   // Prolog for non exception case!
2527
2528   // Save everything in sight.
2529   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2530
2531   // Normal deoptimization.  Save exec mode for unpack_frames.
2532   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2533   __ jmp(cont);
2534
2535   int reexecute_offset = __ pc() - start;
2536   // Reexecute case
2537   // The return address is the pc that describes which bci to re-execute at.
2538
2539   // No need to update map as each call to save_live_registers will produce identical oopmap
2540   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2541
2542   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2543   __ jmp(cont);
2544
2545   int exception_offset = __ pc() - start;
2546
2547   // Prolog for exception case
2548
2549   // all registers are dead at this entry point, except for rax, and
2550   // rdx which contain the exception oop and exception pc
2551   // respectively.  Set them in TLS and fall thru to the
2552   // unpack_with_exception_in_tls entry point.
2553
2554   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2555   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2556
2557   int exception_in_tls_offset = __ pc() - start;
2558
2559   // new implementation because exception oop is now passed in JavaThread
2560
2561   // Prolog for exception case
2562   // All registers must be preserved because they might be used by LinearScan
2563   // Exception oop and throwing PC are passed in JavaThread
2564   // tos: stack at point of call to method that threw the exception (i.e. only
2565   // args are on the stack, no return address)
2566
2567   // make room on stack for the return address
2568   // It will be patched later with the throwing pc. The correct value is not
2569   // available now because loading it from memory would destroy registers.
2570   __ push(0);
2571
2572   // Save everything in sight.
2573   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2574
2575   // Now it is safe to overwrite any register
2576
2577   // Deopt during an exception.  Save exec mode for unpack_frames.
2578   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2579
2580   // load throwing pc from JavaThread and patch it as the return address
2581   // of the current frame. Then clear the field in JavaThread
2582
2583   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2584   __ movptr(Address(rbp, wordSize), rdx);
2585   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2586
2587 #ifdef ASSERT
2588   // verify that there is really an exception oop in JavaThread
2589   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2590   __ verify_oop(rax);
2591
2592   // verify that there is no pending exception
2593   Label no_pending_exception;
2594   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2595   __ testptr(rax, rax);
2596   __ jcc(Assembler::zero, no_pending_exception);
2597   __ stop("must not have pending exception here");
2598   __ bind(no_pending_exception);
2599 #endif
2600
       // All entry points join here with the exec mode in r14.
2601   __ bind(cont);
2602
2603   // Call C code.  Need thread and this frame, but NOT official VM entry
2604   // crud.  We cannot block on this call, no GC can happen.
2605   //
2606   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2607
2608   // fetch_unroll_info needs to call last_java_frame().
2609
2610   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2611 #ifdef ASSERT
2612   { Label L;
2613     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2614     __ jcc(Assembler::equal, L);
2615     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2616     __ bind(L);
2617   }
2618 #endif // ASSERT
2619   __ mov(c_rarg0, r15_thread);
2620   __ movl(c_rarg1, r14); // exec_mode
2621   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2622
2623   // Need to have an oopmap that tells fetch_unroll_info where to
2624   // find any register it might need.
2625   oop_maps->add_gc_map(__ pc() - start, map);
2626
2627   __ reset_last_Java_frame(false);
2628
2629   // Load UnrollBlock* into rdi
2630   __ mov(rdi, rax);
2631
       // Reload the exec mode into r14 from the UnrollBlock's unpack_kind field.
2632   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2633    Label noException;
2634   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2635   __ jcc(Assembler::notEqual, noException);
2636   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2637   // QQQ this is useless it was null above
2638   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2639   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2640   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2641
2642   __ verify_oop(rax);
2643
2644   // Overwrite the result registers with the exception results.
2645   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2646   // I think this is useless
2647   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2648
2649   __ bind(noException);
2650
2651   // Only register save data is on the stack.
2652   // Now restore the result registers.  Everything else is either dead
2653   // or captured in the vframeArray.
2654   RegisterSaver::restore_result_registers(masm);
2655
2656   // All of the register save area has been popped off the stack. Only the
2657   // return address remains.
2658
2659   // Pop all the frames we must move/replace.
2660   //
2661   // Frame picture (youngest to oldest)
2662   // 1: self-frame (no frame link)
2663   // 2: deopting frame  (no frame link)
2664   // 3: caller of deopting frame (could be compiled/interpreted).
2665   //
2666   // Note: by leaving the return address of self-frame on the stack
2667   // and using the size of frame 2 to adjust the stack
2668   // when we are done the return to frame 3 will still be on the stack.
2669
2670   // Pop deoptimized frame
2671   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2672   __ addptr(rsp, rcx);
2673
2674   // rsp should be pointing at the return address to the caller (3)
2675
2676   // Pick up the initial fp we should save
2677   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2678   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2679
2680 #ifdef ASSERT
2681   // Compilers generate code that bang the stack by as much as the
2682   // interpreter would need. So this stack banging should never
2683   // trigger a fault. Verify that it does not on non product builds.
2684   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2685   __ bang_stack_size(rbx, rcx);
2686 #endif
2687
2688   // Load address of array of frame pcs into rcx
2689   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2690
2691   // Trash the old pc
2692   __ addptr(rsp, wordSize);
2693
2694   // Load address of array of frame sizes into rsi
2695   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2696
2697   // Load counter into rdx
2698   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2699
2700   // Now adjust the caller's stack to make up for the extra locals
2701   // but record the original sp so that we can save it in the skeletal interpreter
2702   // frame and the stack walking of interpreter_sender will get the unextended sp
2703   // value and not the "real" sp value.
2704
2705   const Register sender_sp = r8;
2706
2707   __ mov(sender_sp, rsp);
2708   __ movl(rbx, Address(rdi,
2709                        Deoptimization::UnrollBlock::
2710                        caller_adjustment_offset()));
2711   __ subptr(rsp, rbx);
2712
2713   // Push interpreter frames in a loop
       // Per iteration: rsi walks the frame-size array, rcx walks the frame-pc
       // array, rdx counts the remaining frames.
2714   Label loop;
2715   __ bind(loop);
2716   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2717   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2718   __ pushptr(Address(rcx, 0));          // Save return address
2719   __ enter();                           // Save old & set new ebp
2720   __ subptr(rsp, rbx);                  // Prolog
2721   // This value is corrected by layout_activation_impl
2722   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2723   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2724   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2725   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2726   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2727   __ decrementl(rdx);                   // Decrement counter
2728   __ jcc(Assembler::notZero, loop);
2729   __ pushptr(Address(rcx, 0));          // Save final return address
2730
2731   // Re-push self-frame
2732   __ enter();                           // Save old & set new ebp
2733
2734   // Allocate a full sized register save area.
2735   // Return address and rbp are in place, so we allocate two less words.
2736   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2737
2738   // Restore frame locals after moving the frame
       // (xmm0 and rax are written back into their slots of the new save area)
2739   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2740   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2741
2742   // Call C code.  Need thread but NOT official VM entry
2743   // crud.  We cannot block on this call, no GC can happen.  Call should
2744   // restore return values to their stack-slots with the new SP.
2745   //
2746   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2747
2748   // Use rbp because the frames look interpreted now
2749   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2750   // Don't need the precise return PC here, just precise enough to point into this code blob.
2751   address the_pc = __ pc();
2752   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2753
2754   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2755   __ mov(c_rarg0, r15_thread);
2756   __ movl(c_rarg1, r14); // second arg: exec_mode
2757   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2758   // Revert SP alignment after call since we're going to do some SP relative addressing below
2759   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2760
2761   // Set an oopmap for the call site
2762   // Use the same PC we used for the last java frame
2763   oop_maps->add_gc_map(the_pc - start,
2764                        new OopMap( frame_size_in_words, 0 ));
2765
2766   // Clear fp AND pc
2767   __ reset_last_Java_frame(true);
2768
2769   // Collect return values
2770   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2771   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2772   // I think this is useless (throwing pc?)
2773   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2774
2775   // Pop self-frame.
2776   __ leave();                           // Epilog
2777
2778   // Jump to interpreter
2779   __ ret(0);
2780
2781   // Make sure all code is generated
2782   masm->flush();
2783
       // Wrap the generated code in a blob, record the extra entry offset, and
       // hand the blob to the AOT code cache.
2784   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2785   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2786
2787   AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2788 }
2789 
2790 //------------------------------generate_handler_blob------
2791 //
2792 // Generate a special Compile2Runtime blob that saves all registers,
2793 // and setup oopmap.
2794 //
     // Parameters:
     //   id       - stub id; asserted to be a polling page stub id. It selects
     //              the "return" variant (shared_polling_page_return_handler_id)
     //              and whether wide vectors are saved
     //              (shared_polling_page_vectors_safepoint_handler_id).
     //   call_ptr - address of the routine to call; it is called with the
     //              current thread (r15_thread) as its only argument.
     // Returns the SafepointBlob, either loaded via AOTCodeCache::load_code_blob()
     // or freshly generated and passed to AOTCodeCache::store_code_blob().
     //
     // Generated code: save registers, call call_ptr, then either forward a
     // pending exception or restore registers and return. For the non-return
     // variants the return pc is taken from the thread's saved_exception_pc and,
     // unless it was changed during the call, advanced past the poll instruction.
2795 SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
2796   assert(StubRoutines::forward_exception_entry() != nullptr,
2797          "must be generated before");
2798   assert(is_polling_page_id(id), "expected a polling page stub id");
2799
2800   // Allocate space for the code.  Setup code generation tools.
2801   const char* name = SharedRuntime::stub_name(id);
2802   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
2803   if (blob != nullptr) {
2804     return blob->as_safepoint_blob();
2805   }
2806
2807   ResourceMark rm;
2808   OopMapSet *oop_maps = new OopMapSet();
2809   OopMap* map;
2810   CodeBuffer buffer(name, 2548, 1024);
2811   MacroAssembler* masm = new MacroAssembler(&buffer);
2812
2813   address start   = __ pc();
2814   address call_pc = nullptr;
2815   int frame_size_in_words;
2816   bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
2817   bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);
2818
2819   // Make room for return address (or push it again)
2820   if (!cause_return) {
2821     __ push(rbx);
2822   }
2823
2824   // Save registers, fpu state, and flags
2825   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
2826
2827   // The following is basically a call_VM.  However, we need the precise
2828   // address of the call in order to generate an oopmap. Hence, we do all the
2829   // work ourselves.
2830
2831   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
2832
2833   // The return address must always be correct so that frame constructor never
2834   // sees an invalid pc.
2835
2836   if (!cause_return) {
2837     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
2838     // Additionally, rbx is a callee saved register and we can look at it later to determine
2839     // if someone changed the return address for us!
2840     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
2841     __ movptr(Address(rbp, wordSize), rbx);
2842   }
2843
2844   // Do the call
2845   __ mov(c_rarg0, r15_thread);
2846   __ call(RuntimeAddress(call_ptr));
2847
2848   // Set an oopmap for the call site.  This oopmap will map all
2849   // oop-registers and debug-info registers as callee-saved.  This
2850   // will allow deoptimization at this safepoint to find all possible
2851   // debug-info recordings, as well as let GC find all oops.
2852
2853   oop_maps->add_gc_map( __ pc() - start, map);
2854
2855   Label noException;
2856
2857   __ reset_last_Java_frame(false);
2858
2859   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
2860   __ jcc(Assembler::equal, noException);
2861
2862   // Exception pending
       // Restore the saved registers and jump to the forward-exception stub.
2863
2864   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
2865
2866   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2867
2868   // No exception case
2869   __ bind(noException);
2870
2871   Label no_adjust;
2872 #ifdef ASSERT
2873   Label bail;
2874 #endif
2875   if (!cause_return) {
2876     Label no_prefix, not_special, check_rex_prefix;
2877
2878     // If our stashed return pc was modified by the runtime we avoid touching it
2879     __ cmpptr(rbx, Address(rbp, wordSize));
2880     __ jcc(Assembler::notEqual, no_adjust);
2881
2882     // Skip over the poll instruction.
2883     // See NativeInstruction::is_safepoint_poll()
2884     // Possible encodings:
2885     //      85 00       test   %eax,(%rax)
2886     //      85 01       test   %eax,(%rcx)
2887     //      85 02       test   %eax,(%rdx)
2888     //      85 03       test   %eax,(%rbx)
2889     //      85 06       test   %eax,(%rsi)
2890     //      85 07       test   %eax,(%rdi)
2891     //
2892     //   41 85 00       test   %eax,(%r8)
2893     //   41 85 01       test   %eax,(%r9)
2894     //   41 85 02       test   %eax,(%r10)
2895     //   41 85 03       test   %eax,(%r11)
2896     //   41 85 06       test   %eax,(%r14)
2897     //   41 85 07       test   %eax,(%r15)
2898     //
2899     //      85 04 24    test   %eax,(%rsp)
2900     //   41 85 04 24    test   %eax,(%r12)
2901     //      85 45 00    test   %eax,0x0(%rbp)
2902     //   41 85 45 00    test   %eax,0x0(%r13)
2903     //
2904     // Notes:
2905     //  Format of legacy MAP0 test instruction:-
2906     //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
2907     //  o  For safepoint polling instruction "test %eax,(%rax)", encoding of first register
2908     //     operand and base register of memory operand is b/w [0-8), hence we do not require
2909     //     additional REX prefix where REX.B bit stores MSB bit of register encoding, which
2910     //     is why two bytes encoding is sufficient here.
2911     //  o  For safepoint polling instruction like "test %eax,(%r8)", register encoding of BASE
2912     //     register of memory operand is 1000, thus we need additional REX prefix in this case,
2913     //     there by adding additional byte to instruction encoding.
2914     //  o  In case BASE register is one of the 32 extended GPR registers available only on targets
2915     //     supporting Intel APX extension, then we need to emit two bytes REX2 prefix to hold
2916     //     most significant two bits of 5 bit register encoding.
2917
         // Step rbx past an optional two-byte REX2 prefix (only checked when APX
         // is supported), then past an optional one-byte REX.B prefix.
2918     if (VM_Version::supports_apx_f()) {
2919       __ cmpb(Address(rbx, 0), Assembler::REX2);
2920       __ jccb(Assembler::notEqual, check_rex_prefix);
2921       __ addptr(rbx, 2);
2922       __ bind(check_rex_prefix);
2923     }
2924     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
2925     __ jccb(Assembler::notEqual, no_prefix);
2926     __ addptr(rbx, 1);
2927     __ bind(no_prefix);
2928 #ifdef ASSERT
2929     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
2930 #endif
2931     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
2932     // r12/rsp 0x04
2933     // r13/rbp 0x05
         // Unsigned 'above' also rejects values below 4, which wrap around on the
         // subtraction, so only 4 and 5 take the extra-byte path.
2934     __ movzbq(rcx, Address(rbx, 1));
2935     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
2936     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
2937     __ cmpptr(rcx, 1);
2938     __ jccb(Assembler::above, not_special);
2939     __ addptr(rbx, 1);
2940     __ bind(not_special);
2941 #ifdef ASSERT
2942     // Verify the correct encoding of the poll we're about to skip.
2943     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
2944     __ jcc(Assembler::notEqual, bail);
2945     // Mask out the modrm bits
2946     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
2947     // rax encodes to 0, so if the bits are nonzero it's incorrect
2948     __ jcc(Assembler::notZero, bail);
2949 #endif
2950     // Adjust return pc forward to step over the safepoint poll instruction
2951     __ addptr(rbx, 2);
2952     __ movptr(Address(rbp, wordSize), rbx);
2953   }
2954
2955   __ bind(no_adjust);
2956   // Normal exit, restore registers and exit.
2957   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
2958   __ ret(0);
2959
2960 #ifdef ASSERT
2961   __ bind(bail);
2962   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
2963 #endif
2964
2965   // Make sure all code is generated
2966   masm->flush();
2967
2968   // Fill-out other meta info
2969   SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
2970
2971   AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
2972   return sp_blob;
2973 }
2974 
2975 //
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
2977 //
2978 // Generate a stub that calls into vm to find out the proper destination
2979 // of a java call. All the argument registers are live at this point
2980 // but since this is generic code we don't know what they are and the caller
2981 // must do any gc of the args.
2982 //
2983 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
2984   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
2985   assert(is_resolve_id(id), "expected a resolve stub id");
2986 
2987   const char* name = SharedRuntime::stub_name(id);
2988   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
2989   if (blob != nullptr) {
2990     return blob->as_runtime_stub();
2991   }
2992 
2993   // allocate space for the code
2994   ResourceMark rm;
2995   CodeBuffer buffer(name, 1552, 512);
2996   MacroAssembler* masm = new MacroAssembler(&buffer);
2997 
2998   int frame_size_in_words;
2999 
3000   OopMapSet *oop_maps = new OopMapSet();
3001   OopMap* map = nullptr;
3002 
3003   int start = __ offset();
3004 
3005   // No need to save vector registers since they are caller-saved anyway.
3006   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3007 
3008   int frame_complete = __ offset();
3009 
3010   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3011 
3012   __ mov(c_rarg0, r15_thread);
3013 
3014   __ call(RuntimeAddress(destination));
3015 
3016 
3017   // Set an oopmap for the call site.
3018   // We need this not only for callee-saved registers, but also for volatile
3019   // registers that the compiler might be keeping live across a safepoint.
3020 
3021   oop_maps->add_gc_map( __ offset() - start, map);
3022 
3023   // rax contains the address we are going to jump to assuming no exception got installed
3024 
3025   // clear last_Java_sp
3026   __ reset_last_Java_frame(false);
3027   // check for pending exceptions
3028   Label pending;
3029   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3030   __ jcc(Assembler::notEqual, pending);
3031 
3032   // get the returned Method*
3033   __ get_vm_result_metadata(rbx);
3034   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3035 
3036   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3037 
3038   RegisterSaver::restore_live_registers(masm);
3039 
3040   // We are back to the original state on entry and ready to go.
3041 
3042   __ jmp(rax);
3043 
3044   // Pending exception after the safepoint
3045 
3046   __ bind(pending);
3047 
3048   RegisterSaver::restore_live_registers(masm);
3049 
3050   // exception pending => remove activation and forward to exception handler
3051 
3052   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3053 
3054   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3055   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3056 
3057   // -------------
3058   // make sure all code is generated
3059   masm->flush();
3060 
3061   // return the  blob
3062   // frame_size_words or bytes??
3063   RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3064 
3065   AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3066   return rs_blob;
3067 }
3068 
3069 // Continuation point for throwing of implicit exceptions that are
3070 // not handled in the current activation. Fabricates an exception
3071 // oop and initiates normal exception dispatching in this
3072 // frame. Since we need to preserve callee-saved values (currently
3073 // only for C2, but done for C1 as well) we need a callee-saved oop
3074 // map and therefore have to make these stubs into RuntimeStubs
3075 // rather than BufferBlobs.  If the compiler needs all registers to
3076 // be preserved between the fault point and the exception handler
3077 // then it must assume responsibility for that in
3078 // AbstractCompiler::continuation_for_implicit_null_exception or
3079 // continuation_for_implicit_division_by_zero_exception. All other
3080 // implicit exceptions (e.g., NullPointerException or
3081 // AbstractMethodError on entry) are either at call sites or
3082 // otherwise assume that stack unwinding will be initiated, so
3083 // caller saved registers were assumed volatile in the compiler.
3084 RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
3085   assert(is_throw_id(id), "expected a throw stub id");
3086 
3087   const char* name = SharedRuntime::stub_name(id);
3088 
3089   // Information about frame layout at time of blocking runtime call.
3090   // Note that we only have to preserve callee-saved registers since
3091   // the compilers are responsible for supplying a continuation point
3092   // if they expect all registers to be preserved.
3093   enum layout {
3094     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3095     rbp_off2,
3096     return_off,
3097     return_off2,
3098     framesize // inclusive of return address
3099   };
3100 
3101   int insts_size = 512;
3102   int locs_size  = 64;
3103 
3104   const char* timer_msg = "SharedRuntime generate_throw_exception";
3105   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3106 
3107   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3108   if (blob != nullptr) {
3109     return blob->as_runtime_stub();
3110   }
3111 
3112   ResourceMark rm;
3113   CodeBuffer code(name, insts_size, locs_size);
3114   OopMapSet* oop_maps  = new OopMapSet();
3115   MacroAssembler* masm = new MacroAssembler(&code);
3116 
3117   address start = __ pc();
3118 
3119   // This is an inlined and slightly modified version of call_VM
3120   // which has the ability to fetch the return PC out of
3121   // thread-local storage and also sets up last_Java_sp slightly
3122   // differently than the real call_VM
3123 
3124   __ enter(); // required for proper stackwalking of RuntimeStub frame
3125 
3126   assert(is_even(framesize/2), "sp not 16-byte aligned");
3127 
3128   // return address and rbp are already in place
3129   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3130 
3131   int frame_complete = __ pc() - start;
3132 
3133   // Set up last_Java_sp and last_Java_fp
3134   address the_pc = __ pc();
3135   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3136   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3137 
3138   // Call runtime
3139   __ movptr(c_rarg0, r15_thread);
3140   BLOCK_COMMENT("call runtime_entry");
3141   __ call(RuntimeAddress(runtime_entry));
3142 
3143   // Generate oop map
3144   OopMap* map = new OopMap(framesize, 0);
3145 
3146   oop_maps->add_gc_map(the_pc - start, map);
3147 
3148   __ reset_last_Java_frame(true);
3149 
3150   __ leave(); // required for proper stackwalking of RuntimeStub frame
3151 
3152   // check for pending exceptions
3153 #ifdef ASSERT
3154   Label L;
3155   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3156   __ jcc(Assembler::notEqual, L);
3157   __ should_not_reach_here();
3158   __ bind(L);
3159 #endif // ASSERT
3160   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3161 
3162 
3163   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3164   RuntimeStub* stub =
3165     RuntimeStub::new_runtime_stub(name,
3166                                   &code,
3167                                   frame_complete,
3168                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3169                                   oop_maps, false);
3170   AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3171 
3172   return stub;
3173 }
3174 
3175 //------------------------------Montgomery multiplication------------------------
3176 //
3177 
3178 #ifndef _WINDOWS
3179 
3180 // Subtract 0:b from carry:a.  Return carry.
3181 static julong
3182 sub(julong a[], julong b[], julong carry, long len) {
3183   long long i = 0, cnt = len;
3184   julong tmp;
3185   asm volatile("clc; "
3186                "0: ; "
3187                "mov (%[b], %[i], 8), %[tmp]; "
3188                "sbb %[tmp], (%[a], %[i], 8); "
3189                "inc %[i]; dec %[cnt]; "
3190                "jne 0b; "
3191                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3192                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3193                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3194                : "memory");
3195   return tmp;
3196 }
3197 
// Multiply-accumulate: form the unsigned double-length product of Long A
// and Long B and add it into the triple-precision accumulator made up of
// T0 (least significant word), T1 and T2.  The carry out of each word is
// propagated into the next one.
#define MACC(A, B, T0, T1, T2)                                            \
do {                                                                      \
  unsigned long macc_hi, macc_lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"             \
           : "=&d"(macc_hi), "=a"(macc_lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                                      \
} while (0)
3207 
// Like MACC, but the double-length product of A and B is added into the
// accumulator T2:T1:T0 twice.  The product is computed once and the
// add/adc/adc sequence is simply issued a second time.
#define MACC2(A, B, T0, T1, T2)                                           \
do {                                                                      \
  unsigned long macc_hi, macc_lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; "           \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"                     \
           : "=&d"(macc_hi), "=a"(macc_lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                                      \
} while (0)
3218 
3219 #else //_WINDOWS
3220 
3221 static julong
3222 sub(julong a[], julong b[], julong carry, long len) {
3223   long i;
3224   julong tmp;
3225   unsigned char c = 1;
3226   for (i = 0; i < len; i++) {
3227     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3228     a[i] = tmp;
3229   }
3230   c = _addcarry_u64(c, carry, ~0, &tmp);
3231   return tmp;
3232 }
3233 
// Multiply-accumulate: form the unsigned double-length product of Long A
// and Long B with _umul128 and add it into the triple-precision
// accumulator made up of T0 (least significant word), T1 and T2, chaining
// the carry through _addcarry_u64.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  julong macc_hi;                                               \
  julong macc_lo = _umul128(A, B, &macc_hi);                    \
  unsigned char macc_cf = _addcarry_u64(0, macc_lo, T0, &T0);   \
  macc_cf = _addcarry_u64(macc_cf, macc_hi, T1, &T1);           \
  _addcarry_u64(macc_cf, T2, 0, &T2);                           \
} while (0)
3244 
// Like MACC, but the double-length product of A and B is added into the
// accumulator T2:T1:T0 twice.  The product is computed once; the three
// step carry chain is then run two times over the same product words.
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  julong macc_hi;                                               \
  julong macc_lo = _umul128(A, B, &macc_hi);                    \
  unsigned char macc_cf = _addcarry_u64(0, macc_lo, T0, &T0);   \
  macc_cf = _addcarry_u64(macc_cf, macc_hi, T1, &T1);           \
  _addcarry_u64(macc_cf, T2, 0, &T2);                           \
  macc_cf = _addcarry_u64(0, macc_lo, T0, &T0);                 \
  macc_cf = _addcarry_u64(macc_cf, macc_hi, T1, &T1);           \
  _addcarry_u64(macc_cf, T2, 0, &T2);                           \
} while (0)
3258 
3259 #endif //_WINDOWS
3260 
3261 // Fast Montgomery multiplication.  The derivation of the algorithm is
3262 // in  A Cryptographic Library for the Motorola DSP56000,
3263 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3264 
3265 static void NOINLINE
3266 montgomery_multiply(julong a[], julong b[], julong n[],
3267                     julong m[], julong inv, int len) {
3268   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3269   int i;
3270 
3271   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3272 
3273   for (i = 0; i < len; i++) {
3274     int j;
3275     for (j = 0; j < i; j++) {
3276       MACC(a[j], b[i-j], t0, t1, t2);
3277       MACC(m[j], n[i-j], t0, t1, t2);
3278     }
3279     MACC(a[i], b[0], t0, t1, t2);
3280     m[i] = t0 * inv;
3281     MACC(m[i], n[0], t0, t1, t2);
3282 
3283     assert(t0 == 0, "broken Montgomery multiply");
3284 
3285     t0 = t1; t1 = t2; t2 = 0;
3286   }
3287 
3288   for (i = len; i < 2*len; i++) {
3289     int j;
3290     for (j = i-len+1; j < len; j++) {
3291       MACC(a[j], b[i-j], t0, t1, t2);
3292       MACC(m[j], n[i-j], t0, t1, t2);
3293     }
3294     m[i-len] = t0;
3295     t0 = t1; t1 = t2; t2 = 0;
3296   }
3297 
3298   while (t0)
3299     t0 = sub(m, n, t0, len);
3300 }
3301 
3302 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3303 // multiplies so it should be up to 25% faster than Montgomery
3304 // multiplication.  However, its loop control is more complex and it
3305 // may actually run slower on some machines.
3306 
3307 static void NOINLINE
3308 montgomery_square(julong a[], julong n[],
3309                   julong m[], julong inv, int len) {
3310   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3311   int i;
3312 
3313   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3314 
3315   for (i = 0; i < len; i++) {
3316     int j;
3317     int end = (i+1)/2;
3318     for (j = 0; j < end; j++) {
3319       MACC2(a[j], a[i-j], t0, t1, t2);
3320       MACC(m[j], n[i-j], t0, t1, t2);
3321     }
3322     if ((i & 1) == 0) {
3323       MACC(a[j], a[j], t0, t1, t2);
3324     }
3325     for (; j < i; j++) {
3326       MACC(m[j], n[i-j], t0, t1, t2);
3327     }
3328     m[i] = t0 * inv;
3329     MACC(m[i], n[0], t0, t1, t2);
3330 
3331     assert(t0 == 0, "broken Montgomery square");
3332 
3333     t0 = t1; t1 = t2; t2 = 0;
3334   }
3335 
3336   for (i = len; i < 2*len; i++) {
3337     int start = i-len+1;
3338     int end = start + (len - start)/2;
3339     int j;
3340     for (j = start; j < end; j++) {
3341       MACC2(a[j], a[i-j], t0, t1, t2);
3342       MACC(m[j], n[i-j], t0, t1, t2);
3343     }
3344     if ((i & 1) == 0) {
3345       MACC(a[j], a[j], t0, t1, t2);
3346     }
3347     for (; j < len; j++) {
3348       MACC(m[j], n[i-j], t0, t1, t2);
3349     }
3350     m[i-len] = t0;
3351     t0 = t1; t1 = t2; t2 = 0;
3352   }
3353 
3354   while (t0)
3355     t0 = sub(m, n, t0, len);
3356 }
3357 
// Exchange the two 32-bit words of a longword: the upper half moves to
// the lower half and vice versa.
static julong swap(julong x) {
  const julong old_upper = x >> 32;
  const julong old_lower = x << 32;
  return old_lower | old_upper;
}
3362 
3363 // Copy len longwords from s to d, word-swapping as we go.  The
3364 // destination array is reversed.
3365 static void reverse_words(julong *s, julong *d, int len) {
3366   d += len;
3367   while(len-- > 0) {
3368     d--;
3369     *d = swap(*s);
3370     s++;
3371   }
3372 }
3373 
// Array length, in jints, from which SharedRuntime::montgomery_square
// switches from the general multiply routine to the dedicated squaring
// routine (the value is compared against its len argument).
// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64
3377 
3378 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3379                                         jint len, jlong inv,
3380                                         jint *m_ints) {
3381   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3382   int longwords = len/2;
3383 
3384   // Make very sure we don't use so much space that the stack might
3385   // overflow.  512 jints corresponds to an 16384-bit integer and
3386   // will use here a total of 8k bytes of stack space.
3387   int divisor = sizeof(julong) * 4;
3388   guarantee(longwords <= 8192 / divisor, "must be");
3389   int total_allocation = longwords * sizeof (julong) * 4;
3390   julong *scratch = (julong *)alloca(total_allocation);
3391 
3392   // Local scratch arrays
3393   julong
3394     *a = scratch + 0 * longwords,
3395     *b = scratch + 1 * longwords,
3396     *n = scratch + 2 * longwords,
3397     *m = scratch + 3 * longwords;
3398 
3399   reverse_words((julong *)a_ints, a, longwords);
3400   reverse_words((julong *)b_ints, b, longwords);
3401   reverse_words((julong *)n_ints, n, longwords);
3402 
3403   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3404 
3405   reverse_words(m, (julong *)m_ints, longwords);
3406 }
3407 
3408 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3409                                       jint len, jlong inv,
3410                                       jint *m_ints) {
3411   assert(len % 2 == 0, "array length in montgomery_square must be even");
3412   int longwords = len/2;
3413 
3414   // Make very sure we don't use so much space that the stack might
3415   // overflow.  512 jints corresponds to an 16384-bit integer and
3416   // will use here a total of 6k bytes of stack space.
3417   int divisor = sizeof(julong) * 3;
3418   guarantee(longwords <= (8192 / divisor), "must be");
3419   int total_allocation = longwords * sizeof (julong) * 3;
3420   julong *scratch = (julong *)alloca(total_allocation);
3421 
3422   // Local scratch arrays
3423   julong
3424     *a = scratch + 0 * longwords,
3425     *n = scratch + 1 * longwords,
3426     *m = scratch + 2 * longwords;
3427 
3428   reverse_words((julong *)a_ints, a, longwords);
3429   reverse_words((julong *)n_ints, n, longwords);
3430 
3431   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3432     ::montgomery_square(a, n, m, (julong)inv, longwords);
3433   } else {
3434     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3435   }
3436 
3437   reverse_words(m, (julong *)m_ints, longwords);
3438 }
3439 
3440 #if INCLUDE_JFR
3441 
3442 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3443 // It returns a jobject handle to the event writer.
3444 // The handle is dereferenced and the return value is the event writer oop.
3445 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3446   enum layout {
3447     rbp_off,
3448     rbpH_off,
3449     return_off,
3450     return_off2,
3451     framesize // inclusive of return address
3452   };
3453 
3454   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
3455   CodeBuffer code(name, 1024, 64);
3456   MacroAssembler* masm = new MacroAssembler(&code);
3457   address start = __ pc();
3458 
3459   __ enter();
3460   address the_pc = __ pc();
3461 
3462   int frame_complete = the_pc - start;
3463 
3464   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3465   __ movptr(c_rarg0, r15_thread);
3466   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3467   __ reset_last_Java_frame(true);
3468 
3469   // rax is jobject handle result, unpack and process it through a barrier.
3470   __ resolve_global_jobject(rax, c_rarg0);
3471 
3472   __ leave();
3473   __ ret(0);
3474 
3475   OopMapSet* oop_maps = new OopMapSet();
3476   OopMap* map = new OopMap(framesize, 1);
3477   oop_maps->add_gc_map(frame_complete, map);
3478 
3479   RuntimeStub* stub =
3480     RuntimeStub::new_runtime_stub(name,
3481                                   &code,
3482                                   frame_complete,
3483                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3484                                   oop_maps,
3485                                   false);
3486   return stub;
3487 }
3488 
3489 // For c2: call to return a leased buffer.
3490 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3491   enum layout {
3492     rbp_off,
3493     rbpH_off,
3494     return_off,
3495     return_off2,
3496     framesize // inclusive of return address
3497   };
3498 
3499   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
3500   CodeBuffer code(name, 1024, 64);
3501   MacroAssembler* masm = new MacroAssembler(&code);
3502   address start = __ pc();
3503 
3504   __ enter();
3505   address the_pc = __ pc();
3506 
3507   int frame_complete = the_pc - start;
3508 
3509   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3510   __ movptr(c_rarg0, r15_thread);
3511   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3512   __ reset_last_Java_frame(true);
3513 
3514   __ leave();
3515   __ ret(0);
3516 
3517   OopMapSet* oop_maps = new OopMapSet();
3518   OopMap* map = new OopMap(framesize, 1);
3519   oop_maps->add_gc_map(frame_complete, map);
3520 
3521   RuntimeStub* stub =
3522     RuntimeStub::new_runtime_stub(name,
3523                                   &code,
3524                                   frame_complete,
3525                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3526                                   oop_maps,
3527                                   false);
3528   return stub;
3529 }
3530 
3531 #endif // INCLUDE_JFR