1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef _WINDOWS
26 #include "alloca.h"
27 #endif
28 #include "asm/macroAssembler.hpp"
29 #include "asm/macroAssembler.inline.hpp"
30 #include "code/aotCodeCache.hpp"
31 #include "code/compiledIC.hpp"
32 #include "code/debugInfoRec.hpp"
33 #include "code/nativeInst.hpp"
34 #include "code/vtableStubs.hpp"
35 #include "compiler/oopMap.hpp"
36 #include "gc/shared/collectedHeap.hpp"
37 #include "gc/shared/gcLocker.hpp"
38 #include "gc/shared/barrierSet.hpp"
39 #include "gc/shared/barrierSetAssembler.hpp"
40 #include "interpreter/interpreter.hpp"
41 #include "logging/log.hpp"
42 #include "memory/resourceArea.hpp"
43 #include "memory/universe.hpp"
44 #include "oops/klass.inline.hpp"
45 #include "oops/method.inline.hpp"
46 #include "prims/methodHandles.hpp"
47 #include "runtime/continuation.hpp"
48 #include "runtime/continuationEntry.inline.hpp"
49 #include "runtime/globals.hpp"
50 #include "runtime/jniHandles.hpp"
51 #include "runtime/safepointMechanism.hpp"
52 #include "runtime/sharedRuntime.hpp"
53 #include "runtime/signature.hpp"
54 #include "runtime/stubRoutines.hpp"
55 #include "runtime/timerTrace.hpp"
56 #include "runtime/vframeArray.hpp"
57 #include "runtime/vm_version.hpp"
58 #include "utilities/align.hpp"
59 #include "utilities/checkedCast.hpp"
60 #include "utilities/formatBuffer.hpp"
61 #include "vmreg_x86.inline.hpp"
62 #ifdef COMPILER1
63 #include "c1/c1_Runtime1.hpp"
64 #endif
65 #ifdef COMPILER2
66 #include "opto/runtime.hpp"
67 #endif
68 #if INCLUDE_SHENANDOAHGC
69 #include "gc/shenandoah/shenandoahRuntime.hpp"
70 #endif
71 #if INCLUDE_JVMCI
72 #include "jvmci/jvmciJavaClasses.hpp"
73 #endif
74
75 #define __ masm->
76
77 #ifdef PRODUCT
78 #define BLOCK_COMMENT(str) /* nothing */
79 #else
80 #define BLOCK_COMMENT(str) __ block_comment(str)
81 #endif // PRODUCT
82
83 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
84
// Helper that knows how to spill every live register to the stack (building
// an OopMap describing the result) and how to restore them again. Used by
// safepoint/deoptimization blobs that must preserve the full machine state.
class RegisterSaver {
  // Capture info about frame layout. Layout offsets are in jint
  // units because compiler frame slots are jints.
  // Byte offsets of the state components inside the XSAVE image that
  // push_FPU_state() lays out on the stack; these mirror the
  // hardware-defined XSAVE area layout.
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_EGPRS 960
#define XSAVE_AREA_OPMASK_BEGIN 1088
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
  // Each DEF_*_OFFS(n) helper defines the slot-offset pair
  // <reg>n_off / <reg>nH_off (low/high halves) for one register.
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
#define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off
#define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  enum layout {
    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
    DEF_XMM_OFFS(0),
    DEF_XMM_OFFS(1),
    // 2..15 are implied in range usage
    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_YMM_OFFS(0),
    DEF_YMM_OFFS(1),
    // APX extended GPRs r16..r31 (saved only when UseAPX is on)
    r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
    r16H_off,
    r17_off, r17H_off,
    r18_off, r18H_off,
    r19_off, r19H_off,
    r20_off, r20H_off,
    r21_off, r21H_off,
    r22_off, r22H_off,
    r23_off, r23H_off,
    r24_off, r24H_off,
    r25_off, r25H_off,
    r26_off, r26H_off,
    r27_off, r27H_off,
    r28_off, r28H_off,
    r29_off, r29H_off,
    r30_off, r30H_off,
    r31_off, r31H_off,
    opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_OPMASK_OFFS(0),
    DEF_OPMASK_OFFS(1),
    // 2..7 are implied in range usage
    zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_OFFS(0),
    DEF_ZMM_OFFS(1),
    zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
    DEF_ZMM_UPPER_OFFS(16),
    DEF_ZMM_UPPER_OFFS(17),
    // 18..31 are implied in range usage
    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    fpu_stateH_end,
    // Legacy GPRs, in the order save_legacy_gprs()/restore_live_registers()
    // push/pop them above the FPU state.
    r15_off, r15H_off,
    r14_off, r14H_off,
    r13_off, r13H_off,
    r12_off, r12H_off,
    r11_off, r11H_off,
    r10_off, r10H_off,
    r9_off,  r9H_off,
    r8_off,  r8H_off,
    rdi_off, rdiH_off,
    rsi_off, rsiH_off,
    ignore_off, ignoreH_off,  // extra copy of rbp
    rsp_off, rspH_off,
    rbx_off, rbxH_off,
    rdx_off, rdxH_off,
    rcx_off, rcxH_off,
    rax_off, raxH_off,
    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
    align_off, alignH_off,
    flags_off, flagsH_off,
    // The frame sender code expects that rbp will be in the "natural" place and
    // will override any oopMap setting for it. We must therefore force the layout
    // so that it agrees with the frame sender code.
    rbp_off, rbpH_off,        // copy of rbp we will restore
    return_off, returnH_off,  // slot for return address
    reg_save_size             // size in compiler stack slots
  };

  // Clears the wide_vectors request when the platform/compilers cannot
  // produce vectors wider than 16 bytes.
  static void adjust_wide_vectors_support(bool& wide_vectors);

 public:
  // Spill all live registers and return an OopMap describing the frame.
  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
  // Reload all registers saved by save_live_registers and pop the frame.
  static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);

  // Offsets into the register save area
  // Used by deoptimization when it is managing result register
  // values on its own

  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
  static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }

  // During deoptimization only the result registers need to be restored,
  // all the other values have already been extracted.
  static void restore_result_registers(MacroAssembler* masm);
};
186
187 // TODO: Should be upstreamed separately.
188 void RegisterSaver::adjust_wide_vectors_support(bool& wide_vectors) {
189 #if COMPILER2_OR_JVMCI
190 if (wide_vectors && UseAVX == 0) {
191 wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
192 }
193 assert(!wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
194 #else
195 wide_vectors = false; // vectors are generated only by C2 and JVMCI
196 #endif
197 }
198
// Spills flags, all GPRs (including APX r16..r31 when enabled), the FPU/XSAVE
// state and, when save_wide_vectors is set, the upper YMM/ZMM vector state
// onto the stack. Returns an OopMap that maps every saved register to its
// stack slot so GC and deoptimization can find oops and debug values.
// On return *total_frame_words holds the resulting frame size in words.
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
  int off = 0;
  int num_xmm_regs = XMMRegister::available_xmm_registers();

  // Clear save_wide_vectors if wide vectors are not possible in this config.
  adjust_wide_vectors_support(save_wide_vectors);

  // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
  int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  // CodeBlob frame size is in words.
  int frame_size_in_words = frame_size_in_bytes / wordSize;
  *total_frame_words = frame_size_in_words;

  // Save registers, fpu state, and flags.
  // We assume caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return like a normal enter.

  __ enter();          // rsp becomes 16-byte aligned here
  __ pushf();
  // Make sure rsp stays 16-byte aligned
  __ subq(rsp, 8);
  // Push CPU state in multiple of 16 bytes
  __ save_legacy_gprs();
  __ push_FPU_state();


  // push cpu state handles this on EVEX enabled targets
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
      }
      // Save full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Save the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Save the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for(int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  // Save the APX extended GPRs r16..r31, which save_legacy_gprs() above
  // does not cover.
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
    }
  }
#endif

  __ vzeroupper();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Allocate argument register save area
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))

  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());

  // APX extended GPRs, saved above only when UseAPX is on.
  if (UseAPX) {
    map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
    map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
  }
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
  // on EVEX enabled targets, we get it included in the xsave area
  off = xmm0_off;
  int delta = xmm1_off - off;
  for (int n = 0; n < 16; n++) {
    XMMRegister xmm_name = as_XMMRegister(n);
    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
    off += delta;
  }
  if (UseAVX > 2) {
    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
    off = zmm16_off;
    delta = zmm17_off - off;
    for (int n = 16; n < num_xmm_regs; n++) {
      XMMRegister zmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
      off += delta;
    }
  }

#if COMPILER2_OR_JVMCI
  if (save_wide_vectors) {
    // Save upper half of YMM registers(0..15)
    off = ymm0_off;
    delta = ymm1_off - ymm0_off;
    for (int n = 0; n < 16; n++) {
      XMMRegister ymm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
      off += delta;
    }
    if (VM_Version::supports_evex()) {
      // Save upper half of ZMM registers(0..15)
      off = zmm0_off;
      delta = zmm1_off - zmm0_off;
      for (int n = 0; n < 16; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
        off += delta;
      }
    }
  }
#endif // COMPILER2_OR_JVMCI

  // %%% These should all be a waste but we'll keep things as they were for now
  if (true) {
    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
    // rbp location is known implicitly by the frame sender code, needs no oopmap
    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    if (UseAPX) {
      map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
      map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
    }
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
    // on EVEX enabled targets, we get it included in the xsave area
    off = xmm0H_off;
    delta = xmm1H_off - off;
    for (int n = 0; n < 16; n++) {
      XMMRegister xmm_name = as_XMMRegister(n);
      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
      off += delta;
    }
    if (UseAVX > 2) {
      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
      off = zmm16H_off;
      delta = zmm17H_off - off;
      for (int n = 16; n < num_xmm_regs; n++) {
        XMMRegister zmm_name = as_XMMRegister(n);
        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
        off += delta;
      }
    }
  }

  return map;
}
437
// Reloads every register that save_live_registers() spilled (vector state
// first, then FPU/XSAVE state, legacy GPRs, flags and finally rbp) and pops
// the register-save frame. Must exactly mirror the save sequence above.
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
  int num_xmm_regs = XMMRegister::available_xmm_registers();
  if (frame::arg_reg_save_area_bytes != 0) {
    // Pop arg register save area
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  // Must agree with the decision made in save_live_registers().
  adjust_wide_vectors_support(restore_wide_vectors);

  __ vzeroupper();

  // On EVEX enabled targets everything is handled in pop fpu state
  if (restore_wide_vectors) {
    // Restore upper half of YMM registers (0..15)
    int base_addr = XSAVE_AREA_YMM_BEGIN;
    for (int n = 0; n < 16; n++) {
      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
    }
    if (VM_Version::supports_evex()) {
      // Restore upper half of ZMM registers (0..15)
      base_addr = XSAVE_AREA_ZMM_BEGIN;
      for (int n = 0; n < 16; n++) {
        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
      }
      // Restore full ZMM registers(16..num_xmm_regs)
      base_addr = XSAVE_AREA_UPPERBANK;
      int vector_len = Assembler::AVX_512bit;
      int off = 0;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Restore the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  } else {
    if (VM_Version::supports_evex()) {
      // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
      int base_addr = XSAVE_AREA_UPPERBANK;
      int off = 0;
      int vector_len = VM_Version::supports_avx512vl() ? Assembler::AVX_128bit : Assembler::AVX_512bit;
      for (int n = 16; n < num_xmm_regs; n++) {
        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
      }
#if COMPILER2_OR_JVMCI
      // Restore the AVX-512 opmask registers k0..k7
      base_addr = XSAVE_AREA_OPMASK_BEGIN;
      off = 0;
      for (int n = 0; n < KRegister::number_of_registers; n++) {
        __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
      }
#endif
    }
  }

#if COMPILER2_OR_JVMCI
  // Restore the APX extended GPRs r16..r31, not covered by restore_legacy_gprs().
  if (UseAPX) {
    int base_addr = XSAVE_AREA_EGPRS;
    int off = 0;
    for (int n = 16; n < Register::number_of_registers; n++) {
      __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
    }
  }
#endif

  // Recover CPU state
  __ pop_FPU_state();
  __ restore_legacy_gprs();
  __ addq(rsp, 8);   // undo the alignment adjustment made at save time
  __ popf();
  // Get the rbp described implicitly by the calling convention (no oopMap)
  __ pop(rbp);
}
514
// Reloads only the Java result registers (xmm0 for FP, rax/rdx for integer
// results) from the register save area, then pops the whole area leaving
// just the return address on the stack.
void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

  // Just restore result register. Only used by deoptimization. By
  // now any callee save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration so only result registers need to be restored here.

  // Restore fp result register
  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
  // Restore integer result register
  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));

  // Pop all of the register save area off the stack except the return address
  __ addptr(rsp, return_offset_in_bytes());
}
532
533 // Is vector's size (in bytes) bigger than a size saved by default?
534 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
535 bool SharedRuntime::is_wide_vector(int size) {
536 return size > 16;
537 }
538
539 // ---------------------------------------------------------------------------
540 // Read the array of BasicTypes from a signature, and compute where the
541 // arguments should go. Values in the VMRegPair regs array refer to 4-byte
542 // quantities. Values less than VMRegImpl::stack0 are registers, those above
543 // refer to 4-byte stack slots. All stack slots are based off of the stack pointer
544 // as framesizes are fixed.
545 // VMRegImpl::stack0 refers to the first slot 0(sp).
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
547 // Register up to Register::number_of_registers are the 64-bit
548 // integer registers.
549
550 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
551 // either 32-bit or 64-bit depending on the build. The OUTPUTS are in 32-bit
552 // units regardless of build. Of course for i486 there is no 64 bit build
553
554 // The Java calling convention is a "shifted" version of the C ABI.
555 // By skipping the first C ABI register we can call non-static jni methods
556 // with small numbers of arguments without having to shuffle the arguments
557 // at all. Since we control the java ABI we ought to at least get some
558 // advantage out of it.
559
560 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
561 VMRegPair *regs,
562 int total_args_passed) {
563
564 // Create the mapping between argument positions and
565 // registers.
566 static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
567 j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
568 };
569 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
570 j_farg0, j_farg1, j_farg2, j_farg3,
571 j_farg4, j_farg5, j_farg6, j_farg7
572 };
573
574
575 uint int_args = 0;
576 uint fp_args = 0;
577 uint stk_args = 0;
578
579 for (int i = 0; i < total_args_passed; i++) {
580 switch (sig_bt[i]) {
581 case T_BOOLEAN:
582 case T_CHAR:
583 case T_BYTE:
584 case T_SHORT:
585 case T_INT:
586 if (int_args < Argument::n_int_register_parameters_j) {
587 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
588 } else {
589 stk_args = align_up(stk_args, 2);
590 regs[i].set1(VMRegImpl::stack2reg(stk_args));
591 stk_args += 1;
592 }
593 break;
594 case T_VOID:
595 // halves of T_LONG or T_DOUBLE
596 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
597 regs[i].set_bad();
598 break;
599 case T_LONG:
600 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
601 // fall through
602 case T_OBJECT:
603 case T_ARRAY:
604 case T_ADDRESS:
605 if (int_args < Argument::n_int_register_parameters_j) {
606 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
607 } else {
608 stk_args = align_up(stk_args, 2);
609 regs[i].set2(VMRegImpl::stack2reg(stk_args));
610 stk_args += 2;
611 }
612 break;
613 case T_FLOAT:
614 if (fp_args < Argument::n_float_register_parameters_j) {
615 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
616 } else {
617 stk_args = align_up(stk_args, 2);
618 regs[i].set1(VMRegImpl::stack2reg(stk_args));
619 stk_args += 1;
620 }
621 break;
622 case T_DOUBLE:
623 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
624 if (fp_args < Argument::n_float_register_parameters_j) {
625 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
626 } else {
627 stk_args = align_up(stk_args, 2);
628 regs[i].set2(VMRegImpl::stack2reg(stk_args));
629 stk_args += 2;
630 }
631 break;
632 default:
633 ShouldNotReachHere();
634 break;
635 }
636 }
637
638 return stk_args;
639 }
640
// Patch the callers callsite with entry to compiled code if it exists.
// Checks Method::code() in rbx; when compiled code exists, saves the full CPU
// state, calls SharedRuntime::fixup_callers_callsite(Method*, return_address)
// to rewrite the caller's call instruction, then restores state. No-op when
// the method has no compiled code.
static void patch_callers_callsite(MacroAssembler *masm) {
  Label L;
  // rbx holds the Method*; nothing to do if it has no compiled entry.
  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
  __ jcc(Assembler::equal, L);

  // Save the current stack pointer
  __ mov(r13, rsp);
  // Schedule the branch target address early.
  // Call into the VM to patch the caller, then jump to compiled callee
  // rax isn't live so capture return address while we easily can
  __ movptr(rax, Address(rsp, 0));

  // align stack so push_CPU_state doesn't fault
  __ andptr(rsp, -(StackAlignmentInBytes));
  __ push_CPU_state();
  __ vzeroupper();
  // VM needs caller's callsite
  // VM needs target method
  // This needs to be a long call since we will relocate this adapter to
  // the codeBuffer and it may not reach

  // Allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
  }
  __ mov(c_rarg0, rbx);   // target Method*
  __ mov(c_rarg1, rax);   // caller's return address (the callsite to patch)
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));

  // De-allocate argument register save area
  if (frame::arg_reg_save_area_bytes != 0) {
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
  }

  __ vzeroupper();
  __ pop_CPU_state();
  // restore sp
  __ mov(rsp, r13);
  __ bind(L);
}
682
// Generates the compiled-to-interpreter (c2i) adapter: copies arguments from
// their compiled-convention locations (registers and compiled stack slots,
// described by regs[]) into the interpreter's expected stack layout below the
// return address, then jumps to the method's interpreter entry.
// rbx holds the target Method* on entry; r13 is set to the sender SP.
static void gen_c2i_adapter(MacroAssembler *masm,
                            int total_args_passed,
                            int comp_args_on_stack,
                            const BasicType *sig_bt,
                            const VMRegPair *regs,
                            Label& skip_fixup) {
  // Before we get into the guts of the C2I adapter, see if we should be here
  // at all.  We've come from compiled code and are attempting to jump to the
  // interpreter, which means the caller made a static call to get here
  // (vcalls always get a compiled target if there is one).  Check for a
  // compiled target.  If there is one, we need to patch the caller's call.
  patch_callers_callsite(masm);

  __ bind(skip_fixup);

  // Since all args are passed on the stack, total_args_passed *
  // Interpreter::stackElementSize is the space we need.

  assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);

  int extraspace = (total_args_passed * Interpreter::stackElementSize);

  // stack is aligned, keep it that way
  // This is not currently needed or enforced by the interpreter, but
  // we might as well conform to the ABI.
  extraspace = align_up(extraspace, 2*wordSize);

  // set senderSP value
  __ lea(r13, Address(rsp, wordSize));

#ifdef ASSERT
  __ check_stack_alignment(r13, "sender stack not aligned");
#endif
  if (extraspace > 0) {
    // Pop the return address
    __ pop(rax);

    // Make room for the interpreter's argument area.
    __ subptr(rsp, extraspace);

    // Push the return address
    __ push(rax);

    // Account for the return address location since we store it first rather
    // than hold it in a register across all the shuffling
    extraspace += wordSize;
  }

#ifdef ASSERT
  __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
#endif

  // Now write the args into the outgoing interpreter space
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // T_VOID entries are the second halves of longs/doubles; no data to copy.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // offset to start parameters
    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
    int next_off = st_off - Interpreter::stackElementSize;

    // Say 4 args:
    // i   st_off
    // 0   32 T_LONG
    // 1   24 T_VOID
    // 2   16 T_OBJECT
    // 3    8 T_BOOL
    // -    0 return address
    //
    // However to make things extra confusing. Because we can fit a long/double in
    // a single slot on a 64 bit vm and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See I said it was confusing.

    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // memory to memory use rax
      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
      if (!r_2->is_valid()) {
        // sign extend??
        __ movl(rax, Address(rsp, ld_off));
        __ movptr(Address(rsp, st_off), rax);

      } else {

        __ movq(rax, Address(rsp, ld_off));

        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // ld_off == LSW, ld_off+wordSize == MSW
          // st_off == MSW, next_off == LSW
          __ movq(Address(rsp, next_off), rax);
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        } else {
          __ movq(Address(rsp, st_off), rax);
        }
      }
    } else if (r_1->is_Register()) {
      Register r = r_1->as_Register();
      if (!r_2->is_valid()) {
        // must be only an int (or less ) so move only 32bits to slot
        // why not sign extend??
        __ movl(Address(rsp, st_off), r);
      } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
        // T_DOUBLE and T_LONG use two slots in the interpreter
        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
          // long/double in gpr
#ifdef ASSERT
          // Overwrite the unused slot with known junk
          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
          __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
          __ movq(Address(rsp, next_off), r);
        } else {
          __ movptr(Address(rsp, st_off), r);
        }
      }
    } else {
      assert(r_1->is_XMMRegister(), "");
      if (!r_2->is_valid()) {
        // only a float use just part of the slot
        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
      } else {
#ifdef ASSERT
        // Overwrite the unused slot with known junk
        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
        __ movptr(Address(rsp, st_off), rax);
#endif /* ASSERT */
        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
      }
    }
  }

  // Schedule the branch target address early.
  // All args are in place; dispatch to the interpreter entry point.
  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
  __ jmp(rcx);
}
832
// Generate an interpreted-to-compiled (i2c) adapter.
//
// On entry:
//   rbx - callee Method*
//   r13 - sender SP (points at the interpreter-laid-out arguments)
//   rsp - points at the return address into the caller
//
// Each argument is moved from the interpreter's stack layout (one
// Interpreter::stackElementSize slot per value; longs/doubles occupy two
// slots) into the location required by the compiled calling convention
// described by sig_bt[]/regs[], after which control jumps to the callee's
// from_compiled entry.
void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
                                    int total_args_passed,
                                    int comp_args_on_stack,
                                    const BasicType *sig_bt,
                                    const VMRegPair *regs) {

  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do a i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args as
  // we must align the stack to 16 bytes on an i2c entry else we
  // lose alignment we expect in all compiled code and register
  // save code can segv when fxsave instructions find improperly
  // aligned stack pointer.

  // Adapters can be frameless because they do not require the caller
  // to perform additional cleanup work, such as correcting the stack pointer.
  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
  // even if a callee has modified the stack pointer.
  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
  // routinely repairs its caller's stack pointer (from sender_sp, which is set
  // up via the senderSP register).
  // In other words, if *either* the caller or callee is interpreted, we can
  // get the stack pointer repaired after a call.
  // This is why c2i and i2c adapters cannot be indefinitely composed.
  // In particular, if a c2i adapter were to somehow call an i2c adapter,
  // both caller and callee would be compiled methods, and neither would
  // clean up the stack pointer changes performed by the two adapters.
  // If this happens, control eventually transfers back to the compiled
  // caller, but with an uncorrected stack, causing delayed havoc.

  // Must preserve original SP for loading incoming arguments because
  // we need to align the outgoing SP for compiled code.
  __ movptr(r11, rsp);

  // Pick up the return address
  __ pop(rax);

  // Convert 4-byte c2 stack slots to words.
  int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;

  if (comp_args_on_stack) {
    __ subptr(rsp, comp_words_on_stack * wordSize);
  }

  // Ensure compiled code always sees stack at proper alignment
  __ andptr(rsp, -16);

  // push the return address and misalign the stack that youngest frame always sees
  // as far as the placement of the call instruction
  __ push(rax);

  // Put saved SP in another register
  const Register saved_sp = rax;
  __ movptr(saved_sp, r11);

  // Will jump to the compiled code just as if compiled code was doing it.
  // Pre-load the register-jump target early, to schedule it better.
  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    // check if this call should be routed towards a specific entry point
    __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    Label no_alternative_target;
    __ jcc(Assembler::equal, no_alternative_target);
    __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
    // Clear the alternate target so it redirects only this one call.
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
    __ bind(no_alternative_target);
  }
#endif // INCLUDE_JVMCI

  // Now generate the shuffle code. Pick up all register args and move the
  // rest through the floating point stack top.
  for (int i = 0; i < total_args_passed; i++) {
    if (sig_bt[i] == T_VOID) {
      // Longs and doubles are passed in native word order, but misaligned
      // in the 32-bit build.
      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
      continue;
    }

    // Pick up 0, 1 or 2 words from SP+offset.

    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
           "scrambled load targets?");
    // Load in argument order going down.
    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
    // Point to interpreter value (vs. tag)
    int next_off = ld_off - Interpreter::stackElementSize;

    // regs[i] describes where the compiled convention wants this argument:
    // a stack slot, a general-purpose register, or an XMM register.
    VMReg r_1 = regs[i].first();
    VMReg r_2 = regs[i].second();
    if (!r_1->is_valid()) {
      assert(!r_2->is_valid(), "");
      continue;
    }
    if (r_1->is_stack()) {
      // Convert stack slot to an SP offset (+ wordSize to account for return address )
      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;

      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss a reasonable value of r13
      // will be generated.
      if (!r_2->is_valid()) {
        // sign extend???
        __ movl(r13, Address(saved_sp, ld_off));
        __ movptr(Address(rsp, st_off), r13);
      } else {
        //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
        // are accessed as negative so LSW is at LOW address

        // ld_off is MSW so get LSW
        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;
        __ movq(r13, Address(saved_sp, offset));
        // st_off is LSW (i.e. reg.first())
        __ movq(Address(rsp, st_off), r13);
      }
    } else if (r_1->is_Register()) {  // Register argument
      Register r = r_1->as_Register();
      assert(r != rax, "must be different");
      if (r_2->is_valid()) {
        //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
        // So we must adjust where to pick up the data to match the interpreter.

        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
                           next_off : ld_off;

        // this can be a misaligned move
        __ movq(r, Address(saved_sp, offset));
      } else {
        // sign extend and use a full word?
        __ movl(r, Address(saved_sp, ld_off));
      }
    } else {
      // XMM register argument: a float uses one interpreter slot, a double two.
      if (!r_2->is_valid()) {
        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
      } else {
        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
      }
    }
  }

  __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about

  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race thru here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the vm will find there should this case occur.

  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);

  // put Method* where a c2i would expect should we end up there
  // only needed because of c2 resolve stubs returning Method* as a result in
  // rax
  __ mov(rax, rbx);
  __ jmp(r11);
}
1007
1008 // ---------------------------------------------------------------
// Generate both adapter flavors for one compiled-code signature and record
// their entry points in entry_address[]:
//   I2C                 - interpreted caller -> compiled callee shuffle
//   C2I_Unverified      - c2i entry that still performs the inline-cache check
//   C2I                 - c2i entry after the receiver/IC check
//   C2I_No_Clinit_Check - c2i entry that skips the class-initialization barrier
void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
                                            int total_args_passed,
                                            int comp_args_on_stack,
                                            const BasicType *sig_bt,
                                            const VMRegPair *regs,
                                            address entry_address[AdapterBlob::ENTRY_COUNT]) {
  entry_address[AdapterBlob::I2C] = __ pc();

  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);

  // -------------------------------------------------------------------------
  // Generate a C2I adapter. On entry we know rbx holds the Method* during calls
  // to the interpreter. The args start out packed in the compiled layout. They
  // need to be unpacked into the interpreter layout. This will almost always
  // require some stack space. We grow the current (compiled) stack, then repack
  // the args. We finally end in a jump to the generic interpreter entry point.
  // On exit from the interpreter, the interpreter will restore our SP (lest the
  // compiled code, which relies solely on SP and not RBP, get sick).

  entry_address[AdapterBlob::C2I_Unverified] = __ pc();
  Label skip_fixup;

  Register data = rax;
  Register receiver = j_rarg0;
  Register temp = rbx;

  {
    // Inline-cache check: on a hit we fall through; on a miss we jump to the
    // ic-miss stub so the call site can be re-resolved.
    __ ic_check(1 /* end_alignment */);
    __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
    // Method might have been compiled since the call site was patched to
    // interpreted if that is the case treat it as a miss so we can get
    // the call site corrected.
    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
    __ jcc(Assembler::equal, skip_fixup);
    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  }

  entry_address[AdapterBlob::C2I] = __ pc();

  // Class initialization barrier for static methods
  entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
  assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  Label L_skip_barrier;
  Register method = rbx;

  // Bypass the barrier for non-static methods
  Register flags = rscratch1;
  __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
  __ testl(flags, JVM_ACC_STATIC);
  __ jcc(Assembler::zero, L_skip_barrier); // non-static

  Register klass = rscratch1;
  __ load_method_holder(klass, method);
  __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

  __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

  __ bind(L_skip_barrier);
  entry_address[AdapterBlob::C2I_No_Clinit_Check] = __ pc();

  // Let the GC's barrier-set assembler emit any barrier it requires at c2i entry.
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->c2i_entry_barrier(masm);

  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
  return;
}
1075
1076 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1077 VMRegPair *regs,
1078 int total_args_passed) {
1079
1080 // We return the amount of VMRegImpl stack slots we need to reserve for all
1081 // the arguments NOT counting out_preserve_stack_slots.
1082
1083 // NOTE: These arrays will have to change when c1 is ported
1084 #ifdef _WIN64
1085 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1086 c_rarg0, c_rarg1, c_rarg2, c_rarg3
1087 };
1088 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1089 c_farg0, c_farg1, c_farg2, c_farg3
1090 };
1091 #else
1092 static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1093 c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1094 };
1095 static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1096 c_farg0, c_farg1, c_farg2, c_farg3,
1097 c_farg4, c_farg5, c_farg6, c_farg7
1098 };
1099 #endif // _WIN64
1100
1101
1102 uint int_args = 0;
1103 uint fp_args = 0;
1104 uint stk_args = 0; // inc by 2 each time
1105
1106 for (int i = 0; i < total_args_passed; i++) {
1107 switch (sig_bt[i]) {
1108 case T_BOOLEAN:
1109 case T_CHAR:
1110 case T_BYTE:
1111 case T_SHORT:
1112 case T_INT:
1113 if (int_args < Argument::n_int_register_parameters_c) {
1114 regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1115 #ifdef _WIN64
1116 fp_args++;
1117 // Allocate slots for callee to stuff register args the stack.
1118 stk_args += 2;
1119 #endif
1120 } else {
1121 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1122 stk_args += 2;
1123 }
1124 break;
1125 case T_LONG:
1126 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1127 // fall through
1128 case T_OBJECT:
1129 case T_ARRAY:
1130 case T_ADDRESS:
1131 case T_METADATA:
1132 if (int_args < Argument::n_int_register_parameters_c) {
1133 regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1134 #ifdef _WIN64
1135 fp_args++;
1136 stk_args += 2;
1137 #endif
1138 } else {
1139 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1140 stk_args += 2;
1141 }
1142 break;
1143 case T_FLOAT:
1144 if (fp_args < Argument::n_float_register_parameters_c) {
1145 regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1146 #ifdef _WIN64
1147 int_args++;
1148 // Allocate slots for callee to stuff register args the stack.
1149 stk_args += 2;
1150 #endif
1151 } else {
1152 regs[i].set1(VMRegImpl::stack2reg(stk_args));
1153 stk_args += 2;
1154 }
1155 break;
1156 case T_DOUBLE:
1157 assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1158 if (fp_args < Argument::n_float_register_parameters_c) {
1159 regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1160 #ifdef _WIN64
1161 int_args++;
1162 // Allocate slots for callee to stuff register args the stack.
1163 stk_args += 2;
1164 #endif
1165 } else {
1166 regs[i].set2(VMRegImpl::stack2reg(stk_args));
1167 stk_args += 2;
1168 }
1169 break;
1170 case T_VOID: // Halves of longs and doubles
1171 assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1172 regs[i].set_bad();
1173 break;
1174 default:
1175 ShouldNotReachHere();
1176 break;
1177 }
1178 }
1179 #ifdef _WIN64
1180 // windows abi requires that we always allocate enough stack space
1181 // for 4 64bit registers to be stored down.
1182 if (stk_args < 8) {
1183 stk_args = 8;
1184 }
1185 #endif // _WIN64
1186
1187 return stk_args;
1188 }
1189
1190 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1191 uint num_bits,
1192 uint total_args_passed) {
1193 assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1194 "only certain vector sizes are supported for now");
1195
1196 static const XMMRegister VEC_ArgReg[32] = {
1197 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
1198 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1199 xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1200 xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1201 };
1202
1203 uint stk_args = 0;
1204 uint fp_args = 0;
1205
1206 for (uint i = 0; i < total_args_passed; i++) {
1207 VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1208 int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1209 regs[i].set_pair(vmreg->next(next_val), vmreg);
1210 }
1211
1212 return stk_args;
1213 }
1214
1215 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1216 // We always ignore the frame_slots arg and just use the space just below frame pointer
1217 // which by this time is free to use
1218 switch (ret_type) {
1219 case T_FLOAT:
1220 __ movflt(Address(rbp, -wordSize), xmm0);
1221 break;
1222 case T_DOUBLE:
1223 __ movdbl(Address(rbp, -wordSize), xmm0);
1224 break;
1225 case T_VOID: break;
1226 default: {
1227 __ movptr(Address(rbp, -wordSize), rax);
1228 }
1229 }
1230 }
1231
1232 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1233 // We always ignore the frame_slots arg and just use the space just below frame pointer
1234 // which by this time is free to use
1235 switch (ret_type) {
1236 case T_FLOAT:
1237 __ movflt(xmm0, Address(rbp, -wordSize));
1238 break;
1239 case T_DOUBLE:
1240 __ movdbl(xmm0, Address(rbp, -wordSize));
1241 break;
1242 case T_VOID: break;
1243 default: {
1244 __ movptr(rax, Address(rbp, -wordSize));
1245 }
1246 }
1247 }
1248
1249 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1250 for ( int i = first_arg ; i < arg_count ; i++ ) {
1251 if (args[i].first()->is_Register()) {
1252 __ push(args[i].first()->as_Register());
1253 } else if (args[i].first()->is_XMMRegister()) {
1254 __ subptr(rsp, 2*wordSize);
1255 __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1256 }
1257 }
1258 }
1259
1260 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1261 for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1262 if (args[i].first()->is_Register()) {
1263 __ pop(args[i].first()->as_Register());
1264 } else if (args[i].first()->is_XMMRegister()) {
1265 __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1266 __ addptr(rsp, 2*wordSize);
1267 }
1268 }
1269 }
1270
1271 static void verify_oop_args(MacroAssembler* masm,
1272 const methodHandle& method,
1273 const BasicType* sig_bt,
1274 const VMRegPair* regs) {
1275 Register temp_reg = rbx; // not part of any compiled calling seq
1276 if (VerifyOops) {
1277 for (int i = 0; i < method->size_of_parameters(); i++) {
1278 if (is_reference_type(sig_bt[i])) {
1279 VMReg r = regs[i].first();
1280 assert(r->is_valid(), "bad oop arg");
1281 if (r->is_stack()) {
1282 __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1283 __ verify_oop(temp_reg);
1284 } else {
1285 __ verify_oop(r->as_Register());
1286 }
1287 }
1288 }
1289 }
1290 }
1291
// Debug-only check that the calling convention delivered an enterSpecial
// argument in the fixed register the generated code expects; `name` is
// used only in the assert messages.
static void check_continuation_enter_argument(VMReg actual_vmreg,
                                              Register expected_reg,
                                              const char* name) {
  assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
  assert(actual_vmreg->as_Register() == expected_reg,
         "%s is in unexpected register: %s instead of %s",
         name, actual_vmreg->as_Register()->name(), expected_reg->name());
}
1300
1301
1302 //---------------------------- continuation_enter_setup ---------------------------
1303 //
1304 // Arguments:
1305 // None.
1306 //
1307 // Results:
1308 // rsp: pointer to blank ContinuationEntry
1309 //
1310 // Kills:
1311 // rax
1312 //
static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
  // The ContinuationEntry lives directly on the stack, so its size and the
  // oop-holding field offsets must be stack-slot aligned.
  assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::cont_offset()) % VMRegImpl::stack_slot_size == 0, "");
  assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");

  // Carve out space for the (still blank) entry below the current frame.
  stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
  __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));

  // Frame size for the OopMap: the entry itself plus one word for the
  // return address.
  int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
  OopMap* map = new OopMap(frame_size, 0);

  // Link the new entry into the thread's chain of continuation entries:
  // entry->parent = thread->cont_entry; thread->cont_entry = entry (== rsp).
  __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);

  return map;
}
1330
1331 //---------------------------- fill_continuation_entry ---------------------------
1332 //
1333 // Arguments:
1334 // rsp: pointer to blank Continuation entry
1335 // reg_cont_obj: pointer to the continuation
1336 // reg_flags: flags
1337 //
1338 // Results:
1339 // rsp: pointer to filled out ContinuationEntry
1340 //
1341 // Kills:
1342 // rax
1343 //
static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
  assert_different_registers(rax, reg_cont_obj, reg_flags);
#ifdef ASSERT
  // Debug cookie so stack walkers can sanity-check that this frame really
  // is a ContinuationEntry.
  __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
#endif
  // Store the continuation oop and the flags; zero the remaining fields.
  __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
  __ movl (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
  __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
  __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);

  // Save the caller's cont_fastpath value into the entry, then clear the
  // thread-local copy for the duration of this continuation.
  __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);

  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
}
1360
1361 //---------------------------- continuation_enter_cleanup ---------------------------
1362 //
1363 // Arguments:
1364 // rsp: pointer to the ContinuationEntry
1365 //
1366 // Results:
1367 // rsp: pointer to the spilled rbp in the entry frame
1368 //
1369 // Kills:
1370 // rbx
1371 //
static void continuation_enter_cleanup(MacroAssembler* masm) {
#ifdef ASSERT
  // rsp must point exactly at the thread's current ContinuationEntry.
  Label L_good_sp;
  __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  __ jcc(Assembler::equal, L_good_sp);
  __ stop("Incorrect rsp at continuation_enter_cleanup");
  __ bind(L_good_sp);
#endif
  // Restore the parent's cont_fastpath value saved by fill_continuation_entry.
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
  // Unlink this entry from the thread's chain and release its stack space.
  __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
  __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
  __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
}
1386
// Generate the body of the Continuation.enterSpecial intrinsic:
//   enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
// Two entry points are emitted: an interpreted entry (used only in
// interp_only_mode, preceded by an ad-hoc i2c shuffle) and the normal
// compiled entry. Both build a ContinuationEntry frame and then either
// call the thaw stub (isContinue) or resolve and call Continuation.enter.
// The various code offsets (exception handler, frame-complete point, both
// entries) are returned through the reference out-parameters.
static void gen_continuation_enter(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   int& exception_offset,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& interpreted_entry_offset,
                                   int& compiled_entry_offset) {

  // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
  int pos_cont_obj = 0;
  int pos_is_cont = 1;
  int pos_is_virtual = 2;

  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside at these known
  // registers, and we additionally check the placement here in case calling convention ever
  // changes.
  Register reg_cont_obj = c_rarg1;
  Register reg_is_cont = c_rarg2;
  Register reg_is_virtual = c_rarg3;

  check_continuation_enter_argument(regs[pos_cont_obj].first(), reg_cont_obj, "Continuation object");
  check_continuation_enter_argument(regs[pos_is_cont].first(), reg_is_cont, "isContinue");
  check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");

  // Utility methods kill rax, make sure there are no collisions
  assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);

  AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
                         relocInfo::static_call_type);

  address start = __ pc();

  Label L_thaw, L_exit;

  // i2i entry used at interp_only_mode only
  interpreted_entry_offset = __ pc() - start;
  {
#ifdef ASSERT
    Label is_interp_only;
    __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
    __ jcc(Assembler::notEqual, is_interp_only);
    __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
    __ bind(is_interp_only);
#endif

    __ pop(rax); // return address
    // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
    __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
    __ movl(c_rarg2, Address(rsp, Interpreter::stackElementSize*1));
    __ movl(c_rarg3, Address(rsp, Interpreter::stackElementSize*0));
    __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
    __ push(rax); // return address
    __ push_cont_fastpath();

    __ enter();

    stack_slots = 2; // will be adjusted in setup
    OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe,
    // but that's okay because at the very worst we'll miss an async sample, but we're in interp_only_mode anyway.

    __ verify_oop(reg_cont_obj);

    fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

    // If continuation, call to thaw. Otherwise, resolve the call and exit.
    __ testptr(reg_is_cont, reg_is_cont);
    __ jcc(Assembler::notZero, L_thaw);

    // --- Resolve path

    // Make sure the call is patchable
    __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
    // Emit stub for static call
    address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
    if (stub == nullptr) {
      fatal("CodeCache is full at gen_continuation_enter");
    }
    __ call(resolve);
    oop_maps->add_gc_map(__ pc() - start, map);
    __ post_call_nop();

    __ jmp(L_exit);
  }

  // compiled entry
  __ align(CodeEntryAlignment);
  compiled_entry_offset = __ pc() - start;
  __ enter();

  stack_slots = 2; // will be adjusted in setup
  OopMap* map = continuation_enter_setup(masm, stack_slots);

  // Frame is now completed as far as size and linkage.
  frame_complete = __ pc() - start;

  __ verify_oop(reg_cont_obj);

  fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);

  // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
  __ testptr(reg_is_cont, reg_is_cont);
  __ jccb(Assembler::notZero, L_thaw);

  // --- call Continuation.enter(Continuation c, boolean isContinue)

  // Make sure the call is patchable
  __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);

  // Emit stub for static call
  address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
  if (stub == nullptr) {
    fatal("CodeCache is full at gen_continuation_enter");
  }

  // The call needs to be resolved. There's a special case for this in
  // SharedRuntime::find_callee_info_helper() which calls
  // LinkResolver::resolve_continuation_enter() which resolves the call to
  // Continuation.enter(Continuation c, boolean isContinue).
  __ call(resolve);

  oop_maps->add_gc_map(__ pc() - start, map);
  __ post_call_nop();

  __ jmpb(L_exit);

  // --- Thawing path

  __ bind(L_thaw);

  // Record where the thaw call lives so it can be located later.
  ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
  __ call(RuntimeAddress(StubRoutines::cont_thaw()));

  ContinuationEntry::_return_pc_offset = __ pc() - start;
  oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
  __ post_call_nop();

  // --- Normal exit (resolve/thawing)

  __ bind(L_exit);
  ContinuationEntry::_cleanup_offset = __ pc() - start;
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  // --- Exception handling path

  exception_offset = __ pc() - start;

  // Pop the ContinuationEntry frame before dispatching to the handler.
  continuation_enter_cleanup(masm);
  __ pop(rbp);

  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, Address(rsp, 0)); // return address

  // rax still holds the original exception oop, save it before the call
  __ push(rax);

  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
  __ movptr(rbx, rax);

  // Continue at exception handler:
  //   rax: exception oop
  //   rbx: exception handler
  //   rdx: exception pc
  __ pop(rax);
  __ verify_oop(rax);
  __ pop(rdx);
  __ jmp(rbx);
}
1559
// Generate the body of the continuation-yield intrinsic: build a minimal
// two-slot frame, call Continuation::freeze_entry(), then either return
// into the continuation entry frame (freeze succeeded) or return to the
// caller (continuation pinned, possibly with a pending exception).
static void gen_continuation_yield(MacroAssembler* masm,
                                   const VMRegPair* regs,
                                   OopMapSet* oop_maps,
                                   int& frame_complete,
                                   int& stack_slots,
                                   int& compiled_entry_offset) {
  // Frame layout in 32-bit stack slots.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };
  stack_slots = framesize / VMRegImpl::slots_per_word;
  assert(stack_slots == 2, "recheck layout");

  address start = __ pc();
  compiled_entry_offset = __ pc() - start;
  __ enter();
  address the_pc = __ pc();

  frame_complete = the_pc - start;

  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, associate the OopMap
  // with it right away.
  __ post_call_nop();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ movptr(c_rarg1, rsp);
  __ call_VM_leaf(Continuation::freeze_entry(), 2);
  __ reset_last_Java_frame(true);

  Label L_pinned;

  // A non-zero result from freeze means the continuation was pinned.
  __ testptr(rax, rax);
  __ jcc(Assembler::notZero, L_pinned);

  // Freeze succeeded: cut the stack back to the ContinuationEntry and
  // return into the entry frame.
  __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
  continuation_enter_cleanup(masm);
  __ pop(rbp);
  __ ret(0);

  __ bind(L_pinned);

  // Pinned, return to caller

  // handle pending exception thrown by freeze
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  Label ok;
  __ jcc(Assembler::equal, ok);
  __ leave();
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  __ bind(ok);

  __ leave();
  __ ret(0);
}
1621
// Public wrapper so code outside this file can emit the same
// ContinuationEntry cleanup sequence as the file-local helper above.
void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
  ::continuation_enter_cleanup(masm);
}
1625
// Emit the compiled entry for a method-handle intrinsic (invokeBasic,
// linkTo* via ref_kind, or linkToNative): locate the receiver and/or the
// trailing MemberName/NativeEntryPoint argument and hand control to
// MethodHandles::generate_method_handle_dispatch().
static void gen_special_dispatch(MacroAssembler* masm,
                                 const methodHandle& method,
                                 const BasicType* sig_bt,
                                 const VMRegPair* regs) {
  verify_oop_args(masm, method, sig_bt, regs);
  vmIntrinsics::ID iid = method->intrinsic_id();

  // Now write the args into the outgoing interpreter space
  bool has_receiver = false;
  Register receiver_reg = noreg;
  int member_arg_pos = -1;
  Register member_reg = noreg;
  int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  if (ref_kind != 0) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
    member_reg = rbx;  // known to be free at this point
    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  } else if (iid == vmIntrinsics::_invokeBasic) {
    has_receiver = true;
  } else if (iid == vmIntrinsics::_linkToNative) {
    member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
    member_reg = rbx;  // known to be free at this point
  } else {
    fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
  }

  if (member_reg != noreg) {
    // Load the member_arg into register, if necessary.
    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
    VMReg r = regs[member_arg_pos].first();
    if (r->is_stack()) {
      // Stack-passed member: load it (+ wordSize skips the return address).
      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      member_reg = r->as_Register();
    }
  }

  if (has_receiver) {
    // Make sure the receiver is loaded into a register.
    assert(method->size_of_parameters() > 0, "oob");
    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
    VMReg r = regs[0].first();
    assert(r->is_valid(), "bad receiver arg");
    if (r->is_stack()) {
      // Porting note: This assumes that compiled calling conventions always
      // pass the receiver oop in a register. If this is not true on some
      // platform, pick a temp and load the receiver from stack.
      fatal("receiver always in a register");
      // NOTE(review): the two statements below are unreachable after the
      // fatal() above; they document how a stack-passed receiver would be
      // loaded if a platform ever needed it.
      receiver_reg = j_rarg0;  // known to be free at this point
      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
    } else {
      // no data motion is needed
      receiver_reg = r->as_Register();
    }
  }

  // Figure out which address we are really jumping to:
  MethodHandles::generate_method_handle_dispatch(masm, iid,
                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}
1687
1688 // ---------------------------------------------------------------------------
1689 // Generate a native wrapper for a given method. The method takes arguments
1690 // in the Java compiled code convention, marshals them to the native
1691 // convention (handlizes oops, etc), transitions to native, makes the call,
1692 // returns to java state (possibly blocking), unhandlizes any result and
1693 // returns.
1694 //
1695 // Critical native functions are a shorthand for the use of
1696 // GetPrimtiveArrayCritical and disallow the use of any other JNI
1697 // functions. The wrapper is expected to unpack the arguments before
1698 // passing them to the callee. Critical native functions leave the state _in_Java,
1699 // since they cannot stop for GC.
1700 // Some other parts of JNI setup are skipped like the tear down of the JNI handle
1701 // block and the check for pending exceptions it's impossible for them
1702 // to be thrown.
1703 //
// Entry point for generating the JNI native wrapper.  Also handles the
// special cases of Continuation enter/yield intrinsics and method-handle
// linker intrinsics, which get hand-built stubs instead of the full JNI
// marshalling sequence.  Returns the freshly allocated nmethod, or null if
// nmethod allocation failed.
//
//   masm       - assembler positioned at the start of the code buffer
//   method     - the native method (or intrinsic) being wrapped
//   compile_id - id recorded in the resulting nmethod
//   in_sig_bt  - basic types of the incoming Java arguments
//   in_regs    - incoming argument locations (Java calling convention)
//   ret_type   - basic type of the native result
nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
                                                const methodHandle& method,
                                                int compile_id,
                                                BasicType* in_sig_bt,
                                                VMRegPair* in_regs,
                                                BasicType ret_type) {
  // Special case 1: Continuation.enter/yield get dedicated stubs.  The gen_*
  // helpers fill in the offsets below; the ASSERT block checks that the set
  // expected for each intrinsic was actually produced.
  if (method->is_continuation_native_intrinsic()) {
    int exception_offset = -1;
    OopMapSet* oop_maps = new OopMapSet();
    int frame_complete = -1;
    int stack_slots = -1;
    int interpreted_entry_offset = -1;
    int vep_offset = -1;
    if (method->is_continuation_enter_intrinsic()) {
      gen_continuation_enter(masm,
                             in_regs,
                             exception_offset,
                             oop_maps,
                             frame_complete,
                             stack_slots,
                             interpreted_entry_offset,
                             vep_offset);
    } else if (method->is_continuation_yield_intrinsic()) {
      gen_continuation_yield(masm,
                             in_regs,
                             oop_maps,
                             frame_complete,
                             stack_slots,
                             vep_offset);
    } else {
      guarantee(false, "Unknown Continuation native intrinsic");
    }

#ifdef ASSERT
    if (method->is_continuation_enter_intrinsic()) {
      assert(interpreted_entry_offset != -1, "Must be set");
      assert(exception_offset != -1,         "Must be set");
    } else {
      assert(interpreted_entry_offset == -1, "Must be unset");
      assert(exception_offset == -1,         "Must be unset");
    }
    assert(frame_complete != -1,    "Must be set");
    assert(stack_slots != -1,       "Must be set");
    assert(vep_offset != -1,        "Must be set");
#endif

    __ flush();
    // No oop/lock offsets apply to these stubs, hence in_ByteSize(-1).
    nmethod* nm = nmethod::new_native_nmethod(method,
                                              compile_id,
                                              masm->code(),
                                              vep_offset,
                                              frame_complete,
                                              stack_slots,
                                              in_ByteSize(-1),
                                              in_ByteSize(-1),
                                              oop_maps,
                                              exception_offset);
    if (nm == nullptr) return nm;
    if (method->is_continuation_enter_intrinsic()) {
      ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
    } else if (method->is_continuation_yield_intrinsic()) {
      _cont_doYield_stub = nm;
    }
    return nm;
  }

  // Special case 2: method-handle linker intrinsics dispatch straight to the
  // target; no frame is built (frame_complete is marked "not complete").
  if (method->is_method_handle_intrinsic()) {
    vmIntrinsics::ID iid = method->intrinsic_id();
    intptr_t start = (intptr_t)__ pc();
    int vep_offset = ((intptr_t)__ pc()) - start;
    gen_special_dispatch(masm,
                         method,
                         in_sig_bt,
                         in_regs);
    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
    __ flush();
    int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
    return nmethod::new_native_nmethod(method,
                                       compile_id,
                                       masm->code(),
                                       vep_offset,
                                       frame_complete,
                                       stack_slots / VMRegImpl::slots_per_word,
                                       in_ByteSize(-1),
                                       in_ByteSize(-1),
                                       nullptr);
  }
  // Regular JNI native wrapper from here on.
  address native_func = method->native_function();
  assert(native_func != nullptr, "must have function");

  // An OopMap for lock (and class if static)
  OopMapSet *oop_maps = new OopMapSet();
  intptr_t start = (intptr_t)__ pc();

  // We have received a description of where all the java arg are located
  // on entry to the wrapper. We need to convert these args to where
  // the jni function will expect them. To figure out where they go
  // we convert the java signature to a C signature by inserting
  // the hidden arguments as arg[0] and possibly arg[1] (static method)

  const int total_in_args = method->size_of_parameters();
  // +1 for JNIEnv*, +1 more for the class mirror if static.
  int total_c_args = total_in_args + (method->is_static() ? 2 : 1);

  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);

  int argc = 0;
  out_sig_bt[argc++] = T_ADDRESS;  // JNIEnv*
  if (method->is_static()) {
    out_sig_bt[argc++] = T_OBJECT; // class mirror handle
  }

  for (int i = 0; i < total_in_args ; i++ ) {
    out_sig_bt[argc++] = in_sig_bt[i];
  }

  // Now figure out where the args must be stored and how much stack space
  // they require.
  int out_arg_slots;
  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);

  // Compute framesize for the wrapper.  We need to handlize all oops in
  // incoming registers

  // Calculate the total number of stack slots we will need.

  // First count the abi requirement plus all of the outgoing args
  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;

  // Now the space for the inbound oop handle area
  int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers

  int oop_handle_offset = stack_slots;
  stack_slots += total_save_slots;

  // Now any space we need for handlizing a klass if static method

  int klass_slot_offset = 0;
  int klass_offset = -1;
  int lock_slot_offset = 0;
  bool is_static = false;

  if (method->is_static()) {
    klass_slot_offset = stack_slots;
    stack_slots += VMRegImpl::slots_per_word;
    klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
    is_static = true;
  }

  // Plus a lock if needed

  if (method->is_synchronized()) {
    lock_slot_offset = stack_slots;
    stack_slots += VMRegImpl::slots_per_word;
  }

  // Now a place (+2) to save return values or temp during shuffling
  // + 4 for return address (which we own) and saved rbp
  stack_slots += 6;

  // Ok The space we have allocated will look like:
  //
  //
  // FP-> |                     |
  //      |---------------------|
  //      | 2 slots for moves   |
  //      |---------------------|
  //      | lock box (if sync)  |
  //      |---------------------| <- lock_slot_offset
  //      | klass (if static)   |
  //      |---------------------| <- klass_slot_offset
  //      | oopHandle area      |
  //      |---------------------| <- oop_handle_offset (6 java arg registers)
  //      | outbound memory     |
  //      | based arguments     |
  //      |                     |
  //      |---------------------|
  //      |                     |
  // SP-> | out_preserved_slots |
  //
  //


  // Now compute actual number of stack words we need rounding to make
  // stack properly aligned.
  stack_slots = align_up(stack_slots, StackAlignmentInSlots);

  int stack_size = stack_slots * VMRegImpl::stack_slot_size;

  // First thing make an ic check to see if we should even be here

  // We are free to use all registers as temps without saving them and
  // restoring them except rbp. rbp is the only callee save register
  // as far as the interpreter and the compiler(s) are concerned.

  const Register receiver = j_rarg0;

  Label exception_pending;

  assert_different_registers(receiver, rscratch1, rscratch2);
  __ verify_oop(receiver);
  __ ic_check(8 /* end_alignment */);

  int vep_offset = ((intptr_t)__ pc()) - start;

  if (method->needs_clinit_barrier()) {
    // Ensure holder class is initialized before running a static native.
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    Label L_skip_barrier;
    Register klass = r10;
    __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
    __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
  }

#ifdef COMPILER1
  // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
  if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
    inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
  }
#endif // COMPILER1

  // The instruction at the verified entry point must be 5 bytes or longer
  // because it can be patched on the fly by make_non_entrant. The stack bang
  // instruction fits that requirement.

  // Generate stack overflow check
  __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());

  // Generate a new frame for the wrapper.
  __ enter();
  // -2 because return address is already present and so is saved rbp
  __ subptr(rsp, stack_size - 2*wordSize);

  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
  bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);

  // Frame is now completed as far as size and linkage.
  int frame_complete = ((intptr_t)__ pc()) - start;

#ifdef ASSERT
  __ check_stack_alignment(rsp, "improperly aligned stack");
#endif /* ASSERT */


  // We use r14 as the oop handle for the receiver/klass
  // It is callee save so it survives the call to native

  const Register oop_handle_reg = r14;

  //
  // We immediately shuffle the arguments so that any vm call we have to
  // make from here on out (sync slow path, jvmti, etc.) we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.

  // -----------------
  // The Grand Shuffle

  // The Java calling convention is either equal (linux) or denser (win64) than the
  // c calling convention. However the because of the jni_env argument the c calling
  // convention always has at least one more (and two for static) arguments than Java.
  // Therefore if we move the args from java -> c backwards then we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.
  //

  // Record esp-based slot for receiver on stack for non-static methods
  int receiver_offset = -1;

  // This is a trick. We double the stack slots so we can claim
  // the oops in the caller's frame. Since we are sure to have
  // more args than the caller doubling is enough to make
  // sure we can capture all the incoming oop args from the
  // caller.
  //
  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);

  // Mark location of rbp (someday)
  // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));

  // Use eax, ebx as temporaries during any memory-memory moves we have to do
  // All inbound args are referenced based on rbp and all outbound args via rsp.


#ifdef ASSERT
  // Debug-only tracking: once an outgoing register is written we must not
  // read it as an incoming argument again.
  bool reg_destroyed[Register::number_of_registers];
  bool freg_destroyed[XMMRegister::number_of_registers];
  for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
    reg_destroyed[r] = false;
  }
  for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
    freg_destroyed[f] = false;
  }

#endif /* ASSERT */

  // For JNI natives the incoming and outgoing registers are offset upwards.
  GrowableArray<int> arg_order(2 * total_in_args);

  // Build (java-index, c-index) pairs in reverse order; see "Grand Shuffle"
  // comment above for why backwards avoids conflicts.
  for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
    arg_order.push(i);
    arg_order.push(c_arg);
  }

  for (int ai = 0; ai < arg_order.length(); ai += 2) {
    int i = arg_order.at(ai);
    int c_arg = arg_order.at(ai + 1);
    __ block_comment(err_msg("move %d -> %d", i, c_arg));
#ifdef ASSERT
    if (in_regs[i].first()->is_Register()) {
      assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
    } else if (in_regs[i].first()->is_XMMRegister()) {
      assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
    }
    if (out_regs[c_arg].first()->is_Register()) {
      reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
    } else if (out_regs[c_arg].first()->is_XMMRegister()) {
      freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
    }
#endif /* ASSERT */
    switch (in_sig_bt[i]) {
      case T_ARRAY:
      case T_OBJECT:
        // Oops are handlized (stored into the oop-handle area and passed by
        // address); the receiver's slot is remembered for the nmethod.
        __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
                       ((i == 0) && (!is_static)),
                       &receiver_offset);
        break;
      case T_VOID:
        break;

      case T_FLOAT:
        __ float_move(in_regs[i], out_regs[c_arg]);
          break;

      case T_DOUBLE:
        assert( i + 1 < total_in_args &&
                in_sig_bt[i + 1] == T_VOID &&
                out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
        __ double_move(in_regs[i], out_regs[c_arg]);
        break;

      case T_LONG :
        __ long_move(in_regs[i], out_regs[c_arg]);
        break;

      // NOTE(review): no break after the assert below, so a product build
      // would fall through to the default move — presumably unreachable
      // since T_ADDRESS never appears in a Java signature.
      case T_ADDRESS: assert(false, "found T_ADDRESS in java args");

      default:
        __ move32_64(in_regs[i], out_regs[c_arg]);
    }
  }

  int c_arg;

  // Pre-load a static method's oop into r14.  Used both by locking code and
  // the normal JNI call code.
  // point c_arg at the first arg that is already loaded in case we
  // need to spill before we call out
  c_arg = total_c_args - total_in_args;

  if (method->is_static()) {

    //  load oop into a register
    __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));

    // Now handlize the static class mirror it's known not-null.
    __ movptr(Address(rsp, klass_offset), oop_handle_reg);
    map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));

    // Now get the handle
    __ lea(oop_handle_reg, Address(rsp, klass_offset));
    // store the klass handle as second argument
    __ movptr(c_rarg1, oop_handle_reg);
    // and protect the arg if we must spill
    c_arg--;
  }

  // Change state to native (we save the return address in the thread, since it might not
  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
  // points into the right code segment. It does not have to be the correct return pc.
  // We use the same pc/oopMap repeatedly when we call out

  Label native_return;
  if (method->is_object_wait0()) {
    // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
    // The matching add_gc_map for this path happens after the wait returns, below.
    __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
  } else {
    intptr_t the_pc = (intptr_t) __ pc();
    oop_maps->add_gc_map(the_pc - start, map);

    __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
  }

  // We have all of the arguments setup at this point. We must not touch any register
  // argument registers at this point (what if we save/restore them there are no oop?

  if (DTraceMethodProbes) {
    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }

  // RedefineClasses() tracing support for obsolete method entry
  if (log_is_enabled(Trace, redefine, class, obsolete)) {
    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }

  // Lock a synchronized method

  // Register definitions used by locking and unlocking

  const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
  const Register obj_reg  = rbx;  // Will contain the oop
  const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)

  Label slow_path_lock;
  Label lock_done;

  if (method->is_synchronized()) {
    // Get the handle (the 2nd argument)
    __ mov(oop_handle_reg, c_rarg1);

    // Get address of the box

    __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));

    // Load the oop from the handle
    __ movptr(obj_reg, Address(oop_handle_reg, 0));

    __ fast_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);

    // Slow path will re-enter here
    __ bind(lock_done);
  }

  // Finally just about ready to make the JNI call

  // get JNIEnv* which is first argument to native
  __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));

  // Now set thread in native
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);

  __ call(RuntimeAddress(native_func));

  // Verify or restore cpu control state after JNI call
  __ restore_cpu_control_state_after_jni(rscratch1);

  // Unpack native results.
  switch (ret_type) {
  case T_BOOLEAN: __ c2bool(rax);            break;
  case T_CHAR   : __ movzwl(rax, rax);      break;
  case T_BYTE   : __ sign_extend_byte (rax); break;
  case T_SHORT  : __ sign_extend_short(rax); break;
  case T_INT    : /* nothing to do */        break;
  case T_DOUBLE :
  case T_FLOAT  :
    // Result is in xmm0 we'll save as needed
    break;
  case T_ARRAY:                 // Really a handle
  case T_OBJECT:                // Really a handle
      break; // can't de-handlize until after safepoint check
  case T_VOID: break;
  case T_LONG: break;
  default       : ShouldNotReachHere();
  }

  // Switch thread to "native transition" state before reading the synchronization state.
  // This additional state is necessary because reading and testing the synchronization
  // state is not atomic w.r.t. GC, as this scenario demonstrates:
  //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
  //     VM thread changes sync state to synchronizing and suspends threads for GC.
  //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization is progress, and escapes.
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);

  // Force this write out before the read below
  if (!UseSystemMemoryBarrier) {
    __ membar(Assembler::Membar_mask_bits(
              Assembler::LoadLoad | Assembler::LoadStore |
              Assembler::StoreLoad | Assembler::StoreStore));
  }

  // check for safepoint operation in progress and/or pending suspend requests
  {
    Label Continue;
    Label slow_path;

    __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);

    __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
    __ jcc(Assembler::equal, Continue);
    __ bind(slow_path);

    // Don't use call_VM as it will see a possible pending exception and forward it
    // and never return here preventing us from clearing _last_native_pc down below.
    // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
    // by hand.
    //
    __ vzeroupper();
    save_native_result(masm, ret_type, stack_slots);
    __ mov(c_rarg0, r15_thread);
    __ mov(r12, rsp); // remember sp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16); // align stack as required by ABI
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
    __ mov(rsp, r12); // restore sp
    __ reinit_heapbase();
    // Restore any method result value
    restore_native_result(masm, ret_type, stack_slots);
    __ bind(Continue);
  }

  // change thread state
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);

  if (method->is_object_wait0()) {
    // Check preemption for Object.wait()
    // If an alternate return pc was stashed by the preemption machinery,
    // clear it and jump there instead of falling through.
    __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
    __ cmpptr(rscratch1, NULL_WORD);
    __ jccb(Assembler::equal, native_return);
    __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
    __ jmp(rscratch1);
    __ bind(native_return);

    intptr_t the_pc = (intptr_t) __ pc();
    oop_maps->add_gc_map(the_pc - start, map);
  }


  Label reguard;
  Label reguard_done;
  __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
  __ jcc(Assembler::equal, reguard);
  __ bind(reguard_done);

  // native result if any is live

  // Unlock
  Label slow_path_unlock;
  Label unlock_done;
  if (method->is_synchronized()) {

    Label fast_done;

    // Get locked oop from the handle we passed to jni
    __ movptr(obj_reg, Address(oop_handle_reg, 0));

    // Must save rax if it is live now because cmpxchg must use it
    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
      save_native_result(masm, ret_type, stack_slots);
    }

    __ fast_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);

    // slow path re-enters here
    __ bind(unlock_done);
    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
      restore_native_result(masm, ret_type, stack_slots);
    }

    __ bind(fast_done);
  }
  if (DTraceMethodProbes) {
    save_native_result(masm, ret_type, stack_slots);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
         CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
         r15_thread, c_rarg1);
    restore_native_result(masm, ret_type, stack_slots);
  }

  __ reset_last_Java_frame(false);

  // Unbox oop result, e.g. JNIHandles::resolve value.
  if (is_reference_type(ret_type)) {
    __ resolve_jobject(rax /* value */,
                       rcx /* tmp */);
  }

  if (CheckJNICalls) {
    // clear_pending_jni_exception_check
    __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
  }

  // reset handle block
  __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
  __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);

  // pop our frame

  __ leave();

#if INCLUDE_JFR
  // We need to do a poll test after unwind in case the sampler
  // managed to sample the native frame after returning to Java.
  Label L_return;
  address poll_test_pc = __ pc();
  __ relocate(relocInfo::poll_return_type);
  __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
  __ jccb(Assembler::zero, L_return);
  // Record the pc of the poll so the handler can identify this site, then
  // jump to the shared polling-page return stub.
  __ lea(rscratch1, InternalAddress(poll_test_pc));
  __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
  assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
         "polling page return stub not created yet");
  address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
  __ jump(RuntimeAddress(stub));
  __ bind(L_return);
#endif // INCLUDE_JFR

  // Any exception pending?
  __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
  __ jcc(Assembler::notEqual, exception_pending);

  // Return

  __ ret(0);

  // Unexpected paths are out of line and go here

  // forward the exception
  __ bind(exception_pending);

  // and forward the exception
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // Slow path locking & unlocking
  if (method->is_synchronized()) {

    // BEGIN Slow path lock
    __ bind(slow_path_lock);

    // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
    // args are (oop obj, BasicLock* lock, JavaThread* thread)

    // protect the args we've loaded
    save_args(masm, total_c_args, c_arg, out_regs);

    __ mov(c_rarg0, obj_reg);
    __ mov(c_rarg1, lock_reg);
    __ mov(c_rarg2, r15_thread);

    // Not a leaf but we have last_Java_frame setup as we want.
    // We don't want to unmount in case of contention since that would complicate preserving
    // the arguments that had already been marshalled into the native convention. So we force
    // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
    // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
    __ push_cont_fastpath();
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
    __ pop_cont_fastpath();
    restore_args(masm, total_c_args, c_arg, out_regs);

#ifdef ASSERT
    { Label L;
    __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
    __ jcc(Assembler::equal, L);
    __ stop("no pending exception allowed on exit from monitorenter");
    __ bind(L);
    }
#endif
    __ jmp(lock_done);

    // END Slow path lock

    // BEGIN Slow path unlock
    __ bind(slow_path_unlock);

    // If we haven't already saved the native result we must save it now as xmm registers
    // are still exposed.
    __ vzeroupper();
    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
      save_native_result(masm, ret_type, stack_slots);
    }

    __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));

    __ mov(c_rarg0, obj_reg);
    __ mov(c_rarg2, r15_thread);
    __ mov(r12, rsp); // remember sp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16); // align stack as required by ABI

    // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
    // NOTE that obj_reg == rbx currently
    __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);

    // args are (oop obj, BasicLock* lock, JavaThread* thread)
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
    __ mov(rsp, r12); // restore sp
    __ reinit_heapbase();
#ifdef ASSERT
    {
      Label L;
      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
      __ bind(L);
    }
#endif /* ASSERT */

    // Restore the pending exception that was stashed in rbx above.
    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);

    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
      restore_native_result(masm, ret_type, stack_slots);
    }
    __ jmp(unlock_done);

    // END Slow path unlock

  } // synchronized

  // SLOW PATH Reguard the stack if needed

  __ bind(reguard);
  __ vzeroupper();
  save_native_result(masm, ret_type, stack_slots);
  __ mov(r12, rsp); // remember sp
  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  __ andptr(rsp, -16); // align stack as required by ABI
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
  __ mov(rsp, r12); // restore sp
  __ reinit_heapbase();
  restore_native_result(masm, ret_type, stack_slots);
  // and continue
  __ jmp(reguard_done);



  __ flush();

  nmethod *nm = nmethod::new_native_nmethod(method,
                                            compile_id,
                                            masm->code(),
                                            vep_offset,
                                            frame_complete,
                                            stack_slots / VMRegImpl::slots_per_word,
                                            (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
                                            in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
                                            oop_maps);

  return nm;
}
2462
2463 // this function returns the adjust size (in number of words) to a c2i adapter
2464 // activation for use during deoptimization
2465 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2466 return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2467 }
2468
2469
2470 uint SharedRuntime::out_preserve_stack_slots() {
2471 return 0;
2472 }
2473
2474
2475 // Number of stack slots between incoming argument block and the start of
2476 // a new frame. The PROLOG must add this many slots to the stack. The
2477 // EPILOG must remove this many slots. amd64 needs two slots for
2478 // return address.
2479 uint SharedRuntime::in_preserve_stack_slots() {
2480 return 4 + 2 * VerifyStackAtCalls;
2481 }
2482
2483 VMReg SharedRuntime::thread_register() {
2484 return r15_thread->as_VMReg();
2485 }
2486
2487 //------------------------------generate_deopt_blob----------------------------
void SharedRuntime::generate_deopt_blob() {
  // Allocate space for the code
  ResourceMark rm;
  // Setup code generation tools.
  // Pad the code buffer for configurations whose register save/restore
  // sequences are larger than the baseline.
  int pad = 0;
  if (UseAVX > 2) {
    pad += 1024; // wide (ZMM) vector save/restore needs extra room
  }
  if (UseAPX) {
    pad += 1024; // extended GPR (egpr) save/restore needs extra room
  }
#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    pad += 512; // Increase the buffer size when compiling for JVMCI
  }
#endif
  const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
  // Fast path: reuse a previously AOT-cached deopt blob if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
  if (blob != nullptr) {
    _deopt_blob = blob->as_deoptimization_blob();
    return;
  }

  CodeBuffer buffer(name, 2560+pad, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);
  int frame_size_in_words;
  OopMap* map = nullptr;
  OopMapSet *oop_maps = new OopMapSet();

  // -------------
  // This code enters when returning to a de-optimized nmethod. A return
  // address has been pushed on the stack, and return values are in
  // registers.
  // If we are doing a normal deopt then we were called from the patched
  // nmethod from the point we returned to the nmethod. So the return
  // address on the stack is wrong by NativeCall::instruction_size
  // We will adjust the value so it looks like we have the original return
  // address on the stack (like when we eagerly deoptimized).
  // In the case of an exception pending when deoptimizing, we enter
  // with a return address on the stack that points after the call we patched
  // into the exception handler. We have the following register state from,
  // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
  // rax: exception oop
  // rbx: exception handler
  // rdx: throwing pc
  // So in this case we simply jam rdx into the useless return address and
  // the stack looks just like we want.
  //
  // At this point we need to de-opt. We save the argument return
  // registers. We call the first C routine, fetch_unroll_info(). This
  // routine captures the return values and returns a structure which
  // describes the current frame size and the sizes of all replacement frames.
  // The current frame is compiled code and may contain many inlined
  // functions, each with their own JVM state. We pop the current frame, then
  // push all the new frames. Then we call the C routine unpack_frames() to
  // populate these frames. Finally unpack_frames() returns us the new target
  // address. Notice that callee-save registers are BLOWN here; they have
  // already been captured in the vframeArray at the time the return PC was
  // patched.
  address start = __ pc();
  Label cont;

  // Prolog for non exception case!

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Normal deoptimization. Save exec mode for unpack_frames.
  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
  __ jmp(cont);

  // Entry point for re-execution deopts (record offset for blob creation).
  int reexecute_offset = __ pc() - start;
#if INCLUDE_JVMCI && !defined(COMPILER1)
  if (UseJVMCICompiler) {
    // JVMCI does not use this kind of deoptimization
    __ should_not_reach_here();
  }
#endif

  // Reexecute case
  // The return address is the pc that describes which bci to re-execute at.

  // No need to update map as each call to save_live_registers will produce identical oopmap
  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
  __ jmp(cont);

#if INCLUDE_JVMCI
  Label after_fetch_unroll_info_call;
  int implicit_exception_uncommon_trap_offset = 0;
  int uncommon_trap_offset = 0;

  if (EnableJVMCI) {
    // JVMCI entry for uncommon traps raised at an implicit exception:
    // re-push the pc stashed by the signal handler, then fall into the
    // regular uncommon-trap entry below.
    implicit_exception_uncommon_trap_offset = __ pc() - start;

    __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
    __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);

    uncommon_trap_offset = __ pc() - start;

    // Save everything in sight.
    RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
    // fetch_unroll_info needs to call last_java_frame()
    __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

    // Pass the pending deoptimization request (trap request) as arg1 and
    // mark it consumed by storing -1 back into the thread.
    __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
    __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);

    __ movl(r14, Deoptimization::Unpack_reexecute);
    __ mov(c_rarg0, r15_thread);
    __ movl(c_rarg2, r14); // exec mode
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
    oop_maps->add_gc_map( __ pc()-start, map->deep_copy());

    __ reset_last_Java_frame(false);

    // uncommon_trap() already produced the UnrollBlock*; skip the
    // fetch_unroll_info call made by the common path.
    __ jmp(after_fetch_unroll_info_call);
  } // EnableJVMCI
#endif // INCLUDE_JVMCI

  int exception_offset = __ pc() - start;

  // Prolog for exception case

  // all registers are dead at this entry point, except for rax, and
  // rdx which contain the exception oop and exception pc
  // respectively. Set them in TLS and fall thru to the
  // unpack_with_exception_in_tls entry point.

  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);

  int exception_in_tls_offset = __ pc() - start;

  // new implementation because exception oop is now passed in JavaThread

  // Prolog for exception case
  // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
  // tos: stack at point of call to method that threw the exception (i.e. only
  // args are on the stack, no return address)

  // make room on stack for the return address
  // It will be patched later with the throwing pc. The correct value is not
  // available now because loading it from memory would destroy registers.
  __ push(0);

  // Save everything in sight.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);

  // Now it is safe to overwrite any register

  // Deopt during an exception. Save exec mode for unpack_frames.
  __ movl(r14, Deoptimization::Unpack_exception); // callee-saved

  // load throwing pc from JavaThread and patch it as the return address
  // of the current frame. Then clear the field in JavaThread

  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(rbp, wordSize), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);

#ifdef ASSERT
  // verify that there is really an exception oop in JavaThread
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  __ verify_oop(rax);

  // verify that there is no pending exception
  Label no_pending_exception;
  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ testptr(rax, rax);
  __ jcc(Assembler::zero, no_pending_exception);
  __ stop("must not have pending exception here");
  __ bind(no_pending_exception);
#endif

  // All three entry points (deopt, reexecute, exception) meet here.
  __ bind(cont);

  // Call C code. Need thread and this frame, but NOT official VM entry
  // crud. We cannot block on this call, no GC can happen.
  //
  // UnrollBlock* fetch_unroll_info(JavaThread* thread)

  // fetch_unroll_info needs to call last_java_frame().

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
#ifdef ASSERT
  { Label L;
    __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
    __ jcc(Assembler::equal, L);
    __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
    __ bind(L);
  }
#endif // ASSERT
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, r14); // exec_mode
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));

  // Need to have an oopmap that tells fetch_unroll_info where to
  // find any register it might need.
  oop_maps->add_gc_map(__ pc() - start, map);

  __ reset_last_Java_frame(false);

#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    __ bind(after_fetch_unroll_info_call);
  }
#endif

  // Load UnrollBlock* into rdi
  __ mov(rdi, rax);

  // Re-read the actual unpack kind from the UnrollBlock (it may differ from
  // the mode we entered with, e.g. when an exception became pending).
  __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
   Label noException;
  __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
  __ jcc(Assembler::notEqual, noException);
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // QQQ this is useless it was null above
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);

  __ verify_oop(rax);

  // Overwrite the result registers with the exception results.
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  // I think this is useless
  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);

  __ bind(noException);

  // Only register save data is on the stack.
  // Now restore the result registers.  Everything else is either dead
  // or captured in the vframeArray.
  RegisterSaver::restore_result_registers(masm);

  // All of the register save area has been popped off the stack. Only the
  // return address remains.

  // Pop all the frames we must move/replace.
  //
  // Frame picture (youngest to oldest)
  // 1: self-frame (no frame link)
  // 2: deopting frame  (no frame link)
  // 3: caller of deopting frame (could be compiled/interpreted).
  //
  // Note: by leaving the return address of self-frame on the stack
  // and using the size of frame 2 to adjust the stack
  // when we are done the return to frame 3 will still be on the stack.

  // Pop deoptimized frame
  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
  __ addptr(rsp, rcx);

  // rsp should be pointing at the return address to the caller (3)

  // Pick up the initial fp we should save
  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));

#ifdef ASSERT
  // Compilers generate code that bang the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
  __ bang_stack_size(rbx, rcx);
#endif

  // Load address of array of frame pcs into rcx
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));

  // Trash the old pc
  __ addptr(rsp, wordSize);

  // Load address of array of frame sizes into rsi
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));

  // Load counter into rdx
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));

  // Now adjust the caller's stack to make up for the extra locals
  // but record the original sp so that we can save it in the skeletal interpreter
  // frame and the stack walking of interpreter_sender will get the unextended sp
  // value and not the "real" sp value.

  const Register sender_sp = r8;

  __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi,
                       Deoptimization::UnrollBlock::
                       caller_adjustment_offset()));
  __ subptr(rsp, rbx);

  // Push interpreter frames in a loop, youngest-to-oldest from the
  // frame_sizes/frame_pcs arrays. Each iteration builds one skeletal
  // interpreter frame; unpack_frames() fills them in later.
  Label loop;
  __ bind(loop);
  __ movptr(rbx, Address(rsi, 0));      // Load frame size
  __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
  __ pushptr(Address(rcx, 0));          // Save return address
  __ enter();                           // Save old & set new ebp
  __ subptr(rsp, rbx);                  // Prolog
  // This value is corrected by layout_activation_impl
  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
  __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
  __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
  __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
  __ decrementl(rdx);                   // Decrement counter
  __ jcc(Assembler::notZero, loop);
  __ pushptr(Address(rcx, 0));          // Save final return address

  // Re-push self-frame
  __ enter();                           // Save old & set new ebp

  // Allocate a full sized register save area.
  // Return address and rbp are in place, so we allocate two less words.
  __ subptr(rsp, (frame_size_in_words - 2) * wordSize);

  // Restore frame locals after moving the frame
  __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  // Call C code.  Need thread but NOT official VM entry
  // crud.  We cannot block on this call, no GC can happen.  Call should
  // restore return values to their stack-slots with the new SP.
  //
  // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)

  // Use rbp because the frames look interpreted now
  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
  // Don't need the precise return PC here, just precise enough to point into this code blob.
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);

  __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, r14); // second arg: exec_mode
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
  // Revert SP alignment after call since we're going to do some SP relative addressing below
  __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));

  // Set an oopmap for the call site
  // Use the same PC we used for the last java frame
  oop_maps->add_gc_map(the_pc - start,
                       new OopMap( frame_size_in_words, 0 ));

  // Clear fp AND pc
  __ reset_last_Java_frame(true);

  // Collect return values
  __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
  __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
  // I think this is useless (throwing pc?)
  __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));

  // Pop self-frame.
  __ leave();                           // Epilog

  // Jump to interpreter
  __ ret(0);

  // Make sure all code is generated
  masm->flush();

  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
#if INCLUDE_JVMCI
  if (EnableJVMCI) {
    _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
    _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
  }
#endif

  // Persist the freshly generated blob so future runs can load it from cache.
  AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
}
2865
2866 //------------------------------generate_handler_blob------
2867 //
2868 // Generate a special Compile2Runtime blob that saves all registers,
2869 // and setup oopmap.
2870 //
SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
  assert(StubRoutines::forward_exception_entry() != nullptr,
         "must be generated before");
  assert(is_polling_page_id(id), "expected a polling page stub id");

  // Allocate space for the code.  Setup code generation tools.
  const char* name = SharedRuntime::stub_name(id);
  // Fast path: reuse a previously AOT-cached blob if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_safepoint_blob();
  }

  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;
  CodeBuffer buffer(name, 2548, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  address start   = __ pc();
  address call_pc = nullptr;
  int frame_size_in_words;
  // cause_return: we arrived via a normal return (return-handler stub), so the
  // return address on the stack is already valid. Otherwise we were stopped at
  // a poll instruction by the signal handler.
  bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
  bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);

  // Make room for return address (or push it again)
  if (!cause_return) {
    __ push(rbx);
  }

  // Save registers, fpu state, and flags
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);

  // The following is basically a call_VM.  However, we need the precise
  // address of the call in order to generate an oopmap. Hence, we do all the
  // work ourselves.

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:

  // The return address must always be correct so that frame constructor never
  // sees an invalid pc.

  if (!cause_return) {
    // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
    // Additionally, rbx is a callee saved register and we can look at it later to determine
    // if someone changed the return address for us!
    __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), rbx);
  }

  // Do the call
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));

  // Set an oopmap for the call site.  This oopmap will map all
  // oop-registers and debug-info registers as callee-saved.  This
  // will allow deoptimization at this safepoint to find all possible
  // debug-info recordings, as well as let GC find all oops.

  oop_maps->add_gc_map( __ pc() - start, map);

  Label noException;

  __ reset_last_Java_frame(false);

  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::equal, noException);

  // Exception pending

  RegisterSaver::restore_live_registers(masm, save_wide_vectors);

  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // No exception case
  __ bind(noException);

  Label no_adjust;
#ifdef ASSERT
  Label bail;
#endif
  if (!cause_return) {
    Label no_prefix, not_special, check_rex_prefix;

    // If our stashed return pc was modified by the runtime we avoid touching it
    __ cmpptr(rbx, Address(rbp, wordSize));
    __ jcc(Assembler::notEqual, no_adjust);

    // Skip over the poll instruction.
    // See NativeInstruction::is_safepoint_poll()
    // Possible encodings:
    // 85 00       test   %eax,(%rax)
    // 85 01       test   %eax,(%rcx)
    // 85 02       test   %eax,(%rdx)
    // 85 03       test   %eax,(%rbx)
    // 85 06       test   %eax,(%rsi)
    // 85 07       test   %eax,(%rdi)
    //
    // 41 85 00    test   %eax,(%r8)
    // 41 85 01    test   %eax,(%r9)
    // 41 85 02    test   %eax,(%r10)
    // 41 85 03    test   %eax,(%r11)
    // 41 85 06    test   %eax,(%r14)
    // 41 85 07    test   %eax,(%r15)
    //
    // 85 04 24    test   %eax,(%rsp)
    // 41 85 04 24 test   %eax,(%r12)
    // 85 45 00    test   %eax,0x0(%rbp)
    // 41 85 45 00 test   %eax,0x0(%r13)
    //
    // Notes:
    //  Format of legacy MAP0 test instruction:-
    //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
    //  o  For safepoint polling instruction "test %eax,(%rax)", encoding of first register
    //     operand and base register of memory operand is b/w [0-8), hence we do not require
    //     additional REX prefix where REX.B bit stores MSB bit of register encoding, which
    //     is why two bytes encoding is sufficient here.
    //  o  For safepoint polling instruction like "test %eax,(%r8)", register encoding of BASE
    //     register of memory operand is 1000, thus we need additional REX prefix in this case,
    //     there by adding additional byte to instruction encoding.
    //  o  In case BASE register is one of the 32 extended GPR registers available only on targets
    //     supporting Intel APX extension, then we need to emit two bytes REX2 prefix to hold
    //     most significant two bits of 5 bit register encoding.

    if (VM_Version::supports_apx_f()) {
      // A REX2 prefix is two bytes; skip both if present.
      __ cmpb(Address(rbx, 0), Assembler::REX2);
      __ jccb(Assembler::notEqual, check_rex_prefix);
      __ addptr(rbx, 2);
      __ bind(check_rex_prefix);
    }
    // A single-byte REX.B prefix (0x41) precedes polls against r8-r15.
    __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
    __ jccb(Assembler::notEqual, no_prefix);
    __ addptr(rbx, 1);
    __ bind(no_prefix);
#ifdef ASSERT
    __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
#endif
    // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
    // r12/rsp 0x04
    // r13/rbp 0x05
    __ movzbq(rcx, Address(rbx, 1));
    __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
    __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
    __ cmpptr(rcx, 1);
    __ jccb(Assembler::above, not_special);
    __ addptr(rbx, 1);
    __ bind(not_special);
#ifdef ASSERT
    // Verify the correct encoding of the poll we're about to skip.
    __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
    __ jcc(Assembler::notEqual, bail);
    // Mask out the modrm bits
    __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
    // rax encodes to 0, so if the bits are nonzero it's incorrect
    __ jcc(Assembler::notZero, bail);
#endif
    // Adjust return pc forward to step over the safepoint poll instruction
    __ addptr(rbx, 2);
    __ movptr(Address(rbp, wordSize), rbx);
  }

  __ bind(no_adjust);
  // Normal exit, restore registers and exit.
  RegisterSaver::restore_live_registers(masm, save_wide_vectors);
  __ ret(0);

#ifdef ASSERT
  __ bind(bail);
  __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
#endif

  // Make sure all code is generated
  masm->flush();

  // Fill-out other meta info
  SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);

  // Persist the freshly generated blob so future runs can load it from cache.
  AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  return sp_blob;
}
3050
3051 //
3052 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss
3053 //
3054 // Generate a stub that calls into vm to find out the proper destination
3055 // of a java call. All the argument registers are live at this point
3056 // but since this is generic code we don't know what they are and the caller
3057 // must do any gc of the args.
3058 //
RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
  assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
  assert(is_resolve_id(id), "expected a resolve stub id");

  const char* name = SharedRuntime::stub_name(id);
  // Fast path: reuse a previously AOT-cached blob if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_runtime_stub();
  }

  // allocate space for the code
  ResourceMark rm;
  CodeBuffer buffer(name, 1552, 512);
  MacroAssembler* masm = new MacroAssembler(&buffer);

  int frame_size_in_words;

  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = nullptr;

  int start = __ offset();

  // No need to save vector registers since they are caller-saved anyway.
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);

  int frame_complete = __ offset();

  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);

  __ mov(c_rarg0, r15_thread);

  __ call(RuntimeAddress(destination));


  // Set an oopmap for the call site.
  // We need this not only for callee-saved registers, but also for volatile
  // registers that the compiler might be keeping live across a safepoint.

  oop_maps->add_gc_map( __ offset() - start, map);

  // rax contains the address we are going to jump to assuming no exception got installed

  // clear last_Java_sp
  __ reset_last_Java_frame(false);
  // check for pending exceptions
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, pending);

  // get the returned Method*
  __ get_vm_result_metadata(rbx);
  // Stash the resolved Method* and entry address into the register save
  // area so restore_live_registers() reloads them into rbx/rax.
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);

  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);

  RegisterSaver::restore_live_registers(masm);

  // We are back to the original state on entry and ready to go.

  __ jmp(rax);

  // Pending exception after the safepoint

  __ bind(pending);

  RegisterSaver::restore_live_registers(masm);

  // exception pending => remove activation and forward to exception handler

  __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);

  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  // return the blob
  // frame_size_words or bytes??
  RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);

  // Persist the freshly generated blob so future runs can load it from cache.
  AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  return rs_blob;
}
3144
3145 // Continuation point for throwing of implicit exceptions that are
3146 // not handled in the current activation. Fabricates an exception
3147 // oop and initiates normal exception dispatching in this
3148 // frame. Since we need to preserve callee-saved values (currently
3149 // only for C2, but done for C1 as well) we need a callee-saved oop
3150 // map and therefore have to make these stubs into RuntimeStubs
3151 // rather than BufferBlobs. If the compiler needs all registers to
3152 // be preserved between the fault point and the exception handler
3153 // then it must assume responsibility for that in
3154 // AbstractCompiler::continuation_for_implicit_null_exception or
3155 // continuation_for_implicit_division_by_zero_exception. All other
3156 // implicit exceptions (e.g., NullPointerException or
3157 // AbstractMethodError on entry) are either at call sites or
3158 // otherwise assume that stack unwinding will be initiated, so
3159 // caller saved registers were assumed volatile in the compiler.
RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
  assert(is_throw_id(id), "expected a throw stub id");

  const char* name = SharedRuntime::stub_name(id);

  // Information about frame layout at time of blocking runtime call.
  // Note that we only have to preserve callee-saved registers since
  // the compilers are responsible for supplying a continuation point
  // if they expect all registers to be preserved.
  // Layout is expressed in 32-bit VMReg slots; two slots per 64-bit word.
  enum layout {
    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    rbp_off2,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  int insts_size = 512;
  int locs_size  = 64;

  const char* timer_msg = "SharedRuntime generate_throw_exception";
  TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));

  // Fast path: reuse a previously AOT-cached blob if one is available.
  CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
  if (blob != nullptr) {
    return blob->as_runtime_stub();
  }

  ResourceMark rm;
  CodeBuffer code(name, insts_size, locs_size);
  OopMapSet* oop_maps  = new OopMapSet();
  MacroAssembler* masm = new MacroAssembler(&code);

  address start = __ pc();

  // This is an inlined and slightly modified version of call_VM
  // which has the ability to fetch the return PC out of
  // thread-local storage and also sets up last_Java_sp slightly
  // differently than the real call_VM

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  assert(is_even(framesize/2), "sp not 16-byte aligned");

  // return address and rbp are already in place
  __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog

  int frame_complete = __ pc() - start;

  // Set up last_Java_sp and last_Java_fp
  address the_pc = __ pc();
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack

  // Call runtime
  __ movptr(c_rarg0, r15_thread);
  BLOCK_COMMENT("call runtime_entry");
  __ call(RuntimeAddress(runtime_entry));

  // Generate oop map
  OopMap* map = new OopMap(framesize, 0);

  oop_maps->add_gc_map(the_pc - start, map);

  __ reset_last_Java_frame(true);

  __ leave(); // required for proper stackwalking of RuntimeStub frame

  // check for pending exceptions
#ifdef ASSERT
  // The runtime entry must have installed an exception; verify that.
  Label L;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
  __ jcc(Assembler::notEqual, L);
  __ should_not_reach_here();
  __ bind(L);
#endif // ASSERT
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


  // codeBlob framesize is in words (not VMRegImpl::slot_size)
  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps, false);
  // Persist the freshly generated blob so future runs can load it from cache.
  AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));

  return stub;
}
3250
3251 //------------------------------Montgomery multiplication------------------------
3252 //
3253
3254 #ifndef _WINDOWS
3255
// Subtract 0:b from carry:a. Return carry.
// a and b are len-word little-endian multi-precision integers; carry is the
// extra top word of a. The difference is written back into a[], and the
// resulting top word (carry minus the final borrow) is returned.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  // clc clears the borrow, then each iteration does a[i] -= b[i] - borrow
  // via sbb. inc/dec do not touch the carry flag, so the borrow chain is
  // preserved across loop control. The final sbb folds the last borrow
  // into the incoming carry word.
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}
3273
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
// Uses the x86 widening MUL (rdx:rax = rax * operand), then ripples the
// 128-bit product into the triple-precision accumulator with add/adc.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)
3283
// As above, but add twice the double-length result into the
// accumulator.
// The 128-bit product is added in twice (used for the a[j]*a[i-j] terms of
// Montgomery squaring, which appear symmetrically and so count double).
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B) : "cc");                            \
 } while(0)
3294
3295 #else //_WINDOWS
3296
3297 static julong
3298 sub(julong a[], julong b[], julong carry, long len) {
3299 long i;
3300 julong tmp;
3301 unsigned char c = 1;
3302 for (i = 0; i < len; i++) {
3303 c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3304 a[i] = tmp;
3305 }
3306 c = _addcarry_u64(c, carry, ~0, &tmp);
3307 return tmp;
3308 }
3309
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
// MSVC intrinsic version: _umul128 produces the 128-bit product and
// _addcarry_u64 ripples it into the triple-precision accumulator.
#define MACC(A, B, T0, T1, T2)                          \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)
3320
// As above, but add twice the double-length result into the
// accumulator.
// The product is computed once and accumulated twice (for the symmetric
// cross terms of Montgomery squaring).
#define MACC2(A, B, T0, T1, T2)                         \
do {                                                    \
  julong hi, lo;                                        \
  lo = _umul128(A, B, &hi);                             \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
  c = _addcarry_u64(0, lo, T0, &T0);                    \
  c = _addcarry_u64(c, hi, T1, &T1);                    \
  _addcarry_u64(c, T2, 0, &T2);                         \
 } while(0)
3334
3335 #endif //_WINDOWS
3336
// Fast Montgomery multiplication.  The derivation of the algorithm is
// in  A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
//
// Computes m = a * b * R^-1 (mod n), where R = 2^(64*len), interleaving
// the multiplication with the Montgomery reduction one 64-bit word at a
// time. inv must be -n^-1 mod 2^64 (checked by the first assert).

static void NOINLINE
montgomery_multiply(julong a[], julong b[], julong n[],
                    julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");

  // First phase: for each output word i, accumulate the column sums of
  // a*b and m*n, then choose m[i] so the low word of the column cancels.
  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    // Shift the accumulator right one word for the next column.
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Second phase: finish the upper columns; these become the result words.
  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Final conditional subtraction brings the result into [0, n).
  while (t0)
    t0 = sub(m, n, t0, len);
}
3377
// Fast Montgomery squaring.  This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication.  However, its loop control is more complex and it
// may actually run slower on some machines.
//
// Same column-by-column scheme as montgomery_multiply above, but the
// symmetric off-diagonal products a[j]*a[i-j] are computed once and
// doubled via MACC2; the diagonal term a[i/2]^2 exists only for
// even-numbered columns and is added once with a plain MACC.

static void NOINLINE
montgomery_square(julong a[], julong n[],
                  julong m[], julong inv, int len) {
  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");

  // Phase 1: columns 0 .. len-1 (see montgomery_multiply for the
  // reduction step that chooses m[i]).
  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;           // off-diagonal pairs: j < i-j
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);   // doubled symmetric term
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);      // diagonal term a[i/2]^2
    }
    // Remaining reduction terms have no matching a-product.
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    t0 = t1; t1 = t2; t2 = 0;
  }

  // Phase 2: columns len .. 2*len-1, writing result words into m[].
  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;   // off-diagonal pairs again
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);      // diagonal term
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Final conditional subtraction, as in montgomery_multiply.
  while (t0)
    t0 = sub(m, n, t0, len);
}
3433
3434 // Swap words in a longword.
3435 static julong swap(julong x) {
3436 return (x << 32) | (x >> 32);
3437 }
3438
3439 // Copy len longwords from s to d, word-swapping as we go. The
3440 // destination array is reversed.
3441 static void reverse_words(julong *s, julong *d, int len) {
3442 d += len;
3443 while(len-- > 0) {
3444 d--;
3445 *d = swap(*s);
3446 s++;
3447 }
3448 }
3449
// The threshold (in jints) at which squaring is advantageous; below
// it a plain Montgomery multiply of a*a is used instead.  Determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64
3453
// Entry point from the JVM intrinsic: converts the jint-array inputs
// to least-significant-first julong arrays on the stack, runs the
// core multiply, and converts the result back.
void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use here a total of 8k bytes of stack space.
  int divisor = sizeof(julong) * 4;
  guarantee(longwords <= 8192 / divisor, "must be");  // i.e. longwords <= 256
  int total_allocation = longwords * sizeof (julong) * 4;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays: four equal slices of the alloca'd block.
  julong
    *a = scratch + 0 * longwords,
    *b = scratch + 1 * longwords,
    *n = scratch + 2 * longwords,
    *m = scratch + 3 * longwords;

  // reverse_words both reverses longword order and swaps the 32-bit
  // halves within each longword (presumably converting the Java-side
  // most-significant-first jint layout -- verify against the caller).
  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)b_ints, b, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);

  // Convert the result back into the caller's layout.
  reverse_words(m, (julong *)m_ints, longwords);
}
3483
// Entry point from the JVM intrinsic for m = a^2 * R^-1 mod n.  For
// small inputs the plain multiply (a*a) is faster, so the dedicated
// squaring routine is only used above MONTGOMERY_SQUARING_THRESHOLD.
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;

  // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use here a total of 6k bytes of stack space.
  int divisor = sizeof(julong) * 3;
  guarantee(longwords <= (8192 / divisor), "must be");  // i.e. longwords <= 341
  int total_allocation = longwords * sizeof (julong) * 3;
  julong *scratch = (julong *)alloca(total_allocation);

  // Local scratch arrays: three equal slices of the alloca'd block
  // (only a, n, m -- no separate b operand when squaring).
  julong
    *a = scratch + 0 * longwords,
    *n = scratch + 1 * longwords,
    *m = scratch + 2 * longwords;

  // See SharedRuntime::montgomery_multiply for the layout conversion.
  reverse_words((julong *)a_ints, a, longwords);
  reverse_words((julong *)n_ints, n, longwords);

  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_square(a, n, m, (julong)inv, longwords);
  } else {
    // Below the threshold a plain multiply of a by itself is faster.
    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
  }

  reverse_words(m, (julong *)m_ints, longwords);
}
3515
3516 #if INCLUDE_JFR
3517
// For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
// It returns a jobject handle to the event writer.
// The handle is dereferenced and the return value is the event writer oop.
RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
  // Frame layout in 32-bit VMReg slots: this stub's frame is only the
  // saved rbp plus the return address (two 64-bit words).
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();   // set up the rbp-based frame
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  // Record the last Java frame so the runtime can walk the stack,
  // then call into JFR with the current thread as the one argument.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
  __ reset_last_Java_frame(true);

  // rax is jobject handle result, unpack and process it through a barrier.
  __ resolve_global_jobject(rax, c_rarg0);

  __ leave();
  __ ret(0);

  // Register a (register-empty) oop map at the frame-complete offset.
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  // convert the slot count to words
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}
3564
// For c2: call to return a leased buffer.  Same frame shape and
// calling sequence as generate_jfr_write_checkpoint above, but no
// result needs to be unpacked afterwards.
RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
  // Frame layout in 32-bit VMReg slots: saved rbp + return address.
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
  CodeBuffer code(name, 1024, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  __ enter();   // set up the rbp-based frame
  address the_pc = __ pc();

  int frame_complete = the_pc - start;

  // Record the last Java frame, then call into JFR with the current
  // thread as the one argument.
  __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
  __ movptr(c_rarg0, r15_thread);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
  __ reset_last_Java_frame(true);

  __ leave();
  __ ret(0);

  // Register a (register-empty) oop map at the frame-complete offset.
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(framesize, 1);
  oop_maps->add_gc_map(frame_complete, map);

  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  // convert the slot count to words
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps,
                                  false);
  return stub;
}
3606
3607 #endif // INCLUDE_JFR
3608
// Generates a runtime stub that saves all live registers, calls the
// Shenandoah runtime routine selected by stub_id, and restores the
// registers.  Load-reference-barrier (LRB) variants return an object
// in rax; the keepalive (SATB pre-write) variant returns nothing.
RuntimeStub* SharedRuntime::generate_shenandoah_stub(StubId stub_id) {
  assert(UseShenandoahGC, "Only generate when Shenandoah is enabled");

  const char* name = SharedRuntime::stub_name(stub_id);
  address stub_addr = nullptr;
  bool returns_obj = true;

  // Select the C++ runtime entry point for this stub id.
  switch (stub_id) {
  case StubId::shared_shenandoah_keepalive_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::write_barrier_pre);
    returns_obj = false;   // pre-write barrier produces no value
    break;
  }
  case StubId::shared_shenandoah_lrb_strong_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::load_reference_barrier_strong);
    break;
  }
  case StubId::shared_shenandoah_lrb_weak_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::load_reference_barrier_weak);
    break;
  }
  case StubId::shared_shenandoah_lrb_phantom_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::load_reference_barrier_phantom);
    break;
  }
  case StubId::shared_shenandoah_lrb_strong_narrow_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::load_reference_barrier_strong_narrow);
    break;
  }
  case StubId::shared_shenandoah_lrb_weak_narrow_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::load_reference_barrier_weak_narrow);
    break;
  }
  case StubId::shared_shenandoah_lrb_phantom_narrow_id: {
    stub_addr = CAST_FROM_FN_PTR(address, ShenandoahRuntime::load_reference_barrier_phantom_narrow);
    break;
  }
  default:
    ShouldNotReachHere();
  }

  CodeBuffer code(name, 2048, 64);
  MacroAssembler* masm = new MacroAssembler(&code);
  address start = __ pc();

  int frame_size_in_words;
  OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, true);
  address frame_complete_pc = __ pc();

  address post_call_pc;

  // Call the runtime. This is what MacroAssembler::call_VM_leaf does,
  // but we also want to have exact post-call PC for oop map location.
  {
    Label L_stack_aligned, L_end;

#ifdef _WIN64
    // Windows always allocates space for its register args
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
#endif

    // Ensure 16-byte stack alignment at the call, padding with one
    // extra slot when necessary.
    __ testptr(rsp, 15);
    __ jccb(Assembler::zero, L_stack_aligned);
    __ subptr(rsp, 8);
    __ call(RuntimeAddress(stub_addr));
    post_call_pc = __ pc();
    __ addptr(rsp, 8);
    __ jmpb(L_end);
    __ bind(L_stack_aligned);
    __ call(RuntimeAddress(stub_addr));
    // NOTE(review): post_call_pc is overwritten here, so only this
    // aligned-path call site gets the oop map below; the misaligned
    // path's return pc is not covered -- confirm this is intended.
    post_call_pc = __ pc();
    __ bind(L_end);

#ifdef _WIN64
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
#endif
  }

  if (returns_obj) {
    // RegisterSaver would clobber the call result when restoring.
    // Carry the result out of this stub by overwriting saved register.
    __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  }

  OopMapSet* oop_maps = new OopMapSet();
  oop_maps->add_gc_map(post_call_pc - start, map);

  RegisterSaver::restore_live_registers(masm, true);
  __ ret(0);

  return RuntimeStub::new_runtime_stub(name,
                                       &code,
                                       frame_complete_pc - start,
                                       frame_size_in_words,
                                       oop_maps,
                                       true);
}