1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/vframeArray.hpp"
  56 #include "runtime/vm_version.hpp"
  57 #include "utilities/align.hpp"
  58 #include "utilities/checkedCast.hpp"
  59 #include "utilities/formatBuffer.hpp"
  60 #include "vmreg_x86.inline.hpp"
  61 #ifdef COMPILER1
  62 #include "c1/c1_Runtime1.hpp"
  63 #endif
  64 #ifdef COMPILER2
  65 #include "opto/runtime.hpp"
  66 #endif
  67 #if INCLUDE_JVMCI
  68 #include "jvmci/jvmciJavaClasses.hpp"
  69 #endif
  70 
  71 #define __ masm->
  72 
  73 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  74 
  75 class SimpleRuntimeFrame {
  76 
  77   public:
  78 
  79   // Most of the runtime stubs have this simple frame layout.
  80   // This class exists to make the layout shared in one place.
  81   // Offsets are for compiler stack slots, which are jints.
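       // Resulting layout (in 4-byte slots, low addresses first):
       //   [ argument register save area ]  frame::arg_reg_save_area_bytes (may be zero)
       //   [ saved rbp                   ]  rbp_off / rbp_off2
       //   [ return address              ]  return_off / return_off2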
  82   enum layout {
  83     // The frame sender code expects that rbp will be in the "natural" place and
  84     // will override any oopMap setting for it. We must therefore force the layout
  85     // so that it agrees with the frame sender code.
  86     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  87     rbp_off2,
  88     return_off, return_off2,
  89     framesize
  90   };
  91 };
  92 
  93 class RegisterSaver {
  94   // Capture info about frame layout.  Layout offsets are in jint
  95   // units because compiler frame slots are jints.
  96 #define XSAVE_AREA_BEGIN 160
  97 #define XSAVE_AREA_YMM_BEGIN 576
  98 #define XSAVE_AREA_EGPRS 960
  99 #define XSAVE_AREA_OPMASK_BEGIN 1088
 100 #define XSAVE_AREA_ZMM_BEGIN 1152
 101 #define XSAVE_AREA_UPPERBANK 1664
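     // These constants give offsets (from rsp right after push_FPU_state()) into the
     // extended FPU save area: the legacy FXSAVE image fills the first 512 bytes, with
     // xmm0..xmm15 starting at byte 160, the 64-byte XSAVE header follows, and the
     // remaining values are the offsets used for the AVX upper halves (576), the APX
     // extended GPRs (960), the opmask registers (1088) and the AVX-512 state (1152,
     // 1664). They must agree with the save image produced by push_FPU_state().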
 102 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 103 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 104 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 105 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 106 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
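     // For example, DEF_XMM_OFFS(1) expands to
     //   xmm1_off = xmm_off + 1*16/BytesPerInt, xmm1H_off
     // giving each register a named low slot plus an implicitly numbered high slot.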
 107   enum layout {
 108     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
 109     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
 110     DEF_XMM_OFFS(0),
 111     DEF_XMM_OFFS(1),
 112     // 2..15 are implied in range usage
 113     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 114     DEF_YMM_OFFS(0),
 115     DEF_YMM_OFFS(1),
 116     // 2..15 are implied in range usage
 117     r31_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 118     r31H_off,
 119     r30_off, r30H_off,
 120     r29_off, r29H_off,
 121     r28_off, r28H_off,
 122     r27_off, r27H_off,
 123     r26_off, r26H_off,
 124     r25_off, r25H_off,
 125     r24_off, r24H_off,
 126     r23_off, r23H_off,
 127     r22_off, r22H_off,
 128     r21_off, r21H_off,
 129     r20_off, r20H_off,
 130     r19_off, r19H_off,
 131     r18_off, r18H_off,
 132     r17_off, r17H_off,
 133     r16_off, r16H_off,
 134     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 135     DEF_OPMASK_OFFS(0),
 136     DEF_OPMASK_OFFS(1),
 137     // 2..7 are implied in range usage
 138     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 139     DEF_ZMM_OFFS(0),
 140     DEF_ZMM_OFFS(1),
 141     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 142     DEF_ZMM_UPPER_OFFS(16),
 143     DEF_ZMM_UPPER_OFFS(17),
 144     // 18..31 are implied in range usage
 145     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 146     fpu_stateH_end,
 147     r15_off, r15H_off,
 148     r14_off, r14H_off,
 149     r13_off, r13H_off,
 150     r12_off, r12H_off,
 151     r11_off, r11H_off,
 152     r10_off, r10H_off,
 153     r9_off,  r9H_off,
 154     r8_off,  r8H_off,
 155     rdi_off, rdiH_off,
 156     rsi_off, rsiH_off,
 157     ignore_off, ignoreH_off,  // extra copy of rbp
 158     rsp_off, rspH_off,
 159     rbx_off, rbxH_off,
 160     rdx_off, rdxH_off,
 161     rcx_off, rcxH_off,
 162     rax_off, raxH_off,
 163     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 164     align_off, alignH_off,
 165     flags_off, flagsH_off,
 166     // The frame sender code expects that rbp will be in the "natural" place and
 167     // will override any oopMap setting for it. We must therefore force the layout
 168     // so that it agrees with the frame sender code.
 169     rbp_off, rbpH_off,        // copy of rbp we will restore
 170     return_off, returnH_off,  // slot for return address
 171     reg_save_size             // size in compiler stack slots
 172   };
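       // Reading the enum from low offsets to high: the optional argument register
       // save area and the FXSAVE/XSAVE image sit at the bottom of the frame, the
       // general purpose registers, alignment word and flags come next, and the
       // saved rbp plus the return address sit at the top, mirroring the push order
       // in save_live_registers().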
 173 
 174  public:
 175   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 176   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 177 
 178   // Offsets into the register save area
 179   // Used by deoptimization when it is managing result register
 180   // values on its own
 181 
 182   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 183   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 184   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 185   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 186   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 187   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 188 
 189   // During deoptimization only the result registers need to be restored,
 190   // all the other values have already been extracted.
 191   static void restore_result_registers(MacroAssembler* masm);
 192 };
 193 
 194 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 195   int off = 0;
 196   int num_xmm_regs = XMMRegister::available_xmm_registers();
 197 #if COMPILER2_OR_JVMCI
 198   if (save_wide_vectors && UseAVX == 0) {
 199     save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 200   }
 201   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only vectors up to 64 bytes long are supported");
 202 #else
 203   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 204 #endif
 205 
 206   // Always make the frame size 16-byte aligned; both vector and non-vector frames are allocated this way
 207   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 208   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 209   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 210   // CodeBlob frame size is in words.
 211   int frame_size_in_words = frame_size_in_bytes / wordSize;
 212   *total_frame_words = frame_size_in_words;
 213 
 214   // Save registers, fpu state, and flags.
 215   // We assume caller has already pushed the return address onto the
 216   // stack, so rsp is 8-byte aligned here.
 217   // We push rbp twice in this sequence because we want the real rbp
 218   // to be under the return address like a normal enter.
 219 
 220   __ enter();          // rsp becomes 16-byte aligned here
 221   __ pushf();
 222   // Make sure rsp stays 16-byte aligned
 223   __ subq(rsp, 8);
 224   // Push CPU state in multiple of 16 bytes
 225   __ save_legacy_gprs();
 226   __ push_FPU_state();
 227 
 228 
 229   // push cpu state handles this on EVEX enabled targets
 230   if (save_wide_vectors) {
 231     // Save upper half of YMM registers(0..15)
 232     int base_addr = XSAVE_AREA_YMM_BEGIN;
 233     for (int n = 0; n < 16; n++) {
 234       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 235     }
 236     if (VM_Version::supports_evex()) {
 237       // Save upper half of ZMM registers(0..15)
 238       base_addr = XSAVE_AREA_ZMM_BEGIN;
 239       for (int n = 0; n < 16; n++) {
 240         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 241       }
 242       // Save full ZMM registers(16..num_xmm_regs)
 243       base_addr = XSAVE_AREA_UPPERBANK;
 244       off = 0;
 245       int vector_len = Assembler::AVX_512bit;
 246       for (int n = 16; n < num_xmm_regs; n++) {
 247         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 248       }
 249 #if COMPILER2_OR_JVMCI
 250       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 251       off = 0;
 252       for(int n = 0; n < KRegister::number_of_registers; n++) {
 253         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 254       }
 255 #endif
 256     }
 257   } else {
 258     if (VM_Version::supports_evex()) {
 259       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 260       int base_addr = XSAVE_AREA_UPPERBANK;
 261       off = 0;
 262       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 263       for (int n = 16; n < num_xmm_regs; n++) {
 264         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 265       }
 266 #if COMPILER2_OR_JVMCI
 267       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 268       off = 0;
 269       for(int n = 0; n < KRegister::number_of_registers; n++) {
 270         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 271       }
 272 #endif
 273     }
 274   }
 275 
 276 #if COMPILER2_OR_JVMCI
 277   if (UseAPX) {
 278       int base_addr = XSAVE_AREA_EGPRS;
 279       off = 0;
 280       for(int n = 16; n < Register::number_of_registers; n++) {
 281         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 282       }
 283   }
 284 #endif
 285 
 286   __ vzeroupper();
 287   if (frame::arg_reg_save_area_bytes != 0) {
 288     // Allocate argument register save area
 289     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 290   }
 291 
 292   // Set an oopmap for the call site.  This oopmap will map all
 293   // oop-registers and debug-info registers as callee-saved.  This
 294   // will allow deoptimization at this safepoint to find all possible
 295   // debug-info recordings, as well as let GC find all oops.
 296 
 297   OopMapSet *oop_maps = new OopMapSet();
 298   OopMap* map = new OopMap(frame_size_in_slots, 0);
 299 
 300 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 301 
 302   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 305   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 306   // rbp location is known implicitly by the frame sender code, needs no oopmap
 307   // and the location where rbp was saved is ignored
 308   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 309   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 310   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 311   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 312   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 313   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 314   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 315   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 316   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 317   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 318 
 319   if (UseAPX) {
 320     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 324     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 325     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 326     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 327     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 328     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 329     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 330     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 331     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 332     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 333     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 334     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 335     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 336   }
 337   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 338   // on EVEX enabled targets it is also included in the xsave area
 339   off = xmm0_off;
 340   int delta = xmm1_off - off;
 341   for (int n = 0; n < 16; n++) {
 342     XMMRegister xmm_name = as_XMMRegister(n);
 343     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 344     off += delta;
 345   }
 346   if (UseAVX > 2) {
 347     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 348     off = zmm16_off;
 349     delta = zmm17_off - off;
 350     for (int n = 16; n < num_xmm_regs; n++) {
 351       XMMRegister zmm_name = as_XMMRegister(n);
 352       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 353       off += delta;
 354     }
 355   }
 356 
 357 #if COMPILER2_OR_JVMCI
 358   if (save_wide_vectors) {
 359     // Save upper half of YMM registers(0..15)
 360     off = ymm0_off;
 361     delta = ymm1_off - ymm0_off;
 362     for (int n = 0; n < 16; n++) {
 363       XMMRegister ymm_name = as_XMMRegister(n);
 364       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 365       off += delta;
 366     }
 367     if (VM_Version::supports_evex()) {
 368       // Save upper half of ZMM registers(0..15)
 369       off = zmm0_off;
 370       delta = zmm1_off - zmm0_off;
 371       for (int n = 0; n < 16; n++) {
 372         XMMRegister zmm_name = as_XMMRegister(n);
 373         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 374         off += delta;
 375       }
 376     }
 377   }
 378 #endif // COMPILER2_OR_JVMCI
 379 
 380   // %%% These should all be a waste but we'll keep things as they were for now
 381   if (true) {
 382     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 385     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 386     // rbp location is known implicitly by the frame sender code, needs no oopmap
 387     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 388     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 389     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 390     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 391     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 392     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 393     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 394     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 395     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 396     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 397     if (UseAPX) {
 398       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 402       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 403       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 404       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 405       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 406       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 407       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 408       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 409       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 410       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 411       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 412       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 413       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 414     }
 415     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 416     // on EVEX enabled targets it is also included in the xsave area
 417     off = xmm0H_off;
 418     delta = xmm1H_off - off;
 419     for (int n = 0; n < 16; n++) {
 420       XMMRegister xmm_name = as_XMMRegister(n);
 421       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 422       off += delta;
 423     }
 424     if (UseAVX > 2) {
 425       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 426       off = zmm16H_off;
 427       delta = zmm17H_off - off;
 428       for (int n = 16; n < num_xmm_regs; n++) {
 429         XMMRegister zmm_name = as_XMMRegister(n);
 430         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 431         off += delta;
 432       }
 433     }
 434   }
 435 
 436   return map;
 437 }
 438 
 439 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 440   int num_xmm_regs = XMMRegister::available_xmm_registers();
 441   if (frame::arg_reg_save_area_bytes != 0) {
 442     // Pop arg register save area
 443     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 444   }
 445 
 446 #if COMPILER2_OR_JVMCI
 447   if (restore_wide_vectors) {
 448     assert(UseAVX > 0, "Vectors larger than 16 bytes are supported only with AVX");
 449     assert(MaxVectorSize <= 64, "Only vectors up to 64 bytes long are supported");
 450   }
 451 #else
 452   assert(!restore_wide_vectors, "vectors are generated only by C2");
 453 #endif
 454 
 455   __ vzeroupper();
 456 
 457   // On EVEX enabled targets everything is handled in pop fpu state
 458   if (restore_wide_vectors) {
 459     // Restore upper half of YMM registers (0..15)
 460     int base_addr = XSAVE_AREA_YMM_BEGIN;
 461     for (int n = 0; n < 16; n++) {
 462       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 463     }
 464     if (VM_Version::supports_evex()) {
 465       // Restore upper half of ZMM registers (0..15)
 466       base_addr = XSAVE_AREA_ZMM_BEGIN;
 467       for (int n = 0; n < 16; n++) {
 468         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 469       }
 470       // Restore full ZMM registers(16..num_xmm_regs)
 471       base_addr = XSAVE_AREA_UPPERBANK;
 472       int vector_len = Assembler::AVX_512bit;
 473       int off = 0;
 474       for (int n = 16; n < num_xmm_regs; n++) {
 475         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 476       }
 477 #if COMPILER2_OR_JVMCI
 478       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 479       off = 0;
 480       for (int n = 0; n < KRegister::number_of_registers; n++) {
 481         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 482       }
 483 #endif
 484     }
 485   } else {
 486     if (VM_Version::supports_evex()) {
 487       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 488       int base_addr = XSAVE_AREA_UPPERBANK;
 489       int off = 0;
 490       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 491       for (int n = 16; n < num_xmm_regs; n++) {
 492         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 493       }
 494 #if COMPILER2_OR_JVMCI
 495       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 496       off = 0;
 497       for (int n = 0; n < KRegister::number_of_registers; n++) {
 498         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 499       }
 500 #endif
 501     }
 502   }
 503 
 504 #if COMPILER2_OR_JVMCI
 505   if (UseAPX) {
 506     int base_addr = XSAVE_AREA_EGPRS;
 507     int off = 0;
 508     for (int n = 16; n < Register::number_of_registers; n++) {
 509       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 510     }
 511   }
 512 #endif
 513 
 514   // Recover CPU state
 515   __ pop_FPU_state();
 516   __ restore_legacy_gprs();
 517   __ addq(rsp, 8);
 518   __ popf();
 519   // Get the rbp described implicitly by the calling convention (no oopMap)
 520   __ pop(rbp);
 521 }
 522 
 523 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 524 
 525   // Just restore result register. Only used by deoptimization. By
 526   // now any callee save register that needs to be restored to a c2
 527   // caller of the deoptee has been extracted into the vframeArray
 528   // and will be stuffed into the c2i adapter we create for later
 529   // restoration, so only result registers need to be restored here.
 530 
 531   // Restore fp result register
 532   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 533   // Restore integer result register
 534   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 535   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 536 
 537   // Pop all of the register save area off the stack except the return address
 538   __ addptr(rsp, return_offset_in_bytes());
 539 }
 540 
 541 // Is the vector's size (in bytes) bigger than the size saved by default?
 542 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
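     // For example, a 32-byte YMM or 64-byte ZMM vector is "wide"; a 16-byte XMM vector is not.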
 543 bool SharedRuntime::is_wide_vector(int size) {
 544   return size > 16;
 545 }
 546 
 547 // ---------------------------------------------------------------------------
 548 // Read the array of BasicTypes from a signature, and compute where the
 549 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 550 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 551 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 552 // as framesizes are fixed.
 553 // VMRegImpl::stack0 refers to the first slot 0(sp), and VMRegImpl::stack0+1
 554 // refers to the memory word 4 bytes higher.
 555 // Registers up to Register::number_of_registers are the 64-bit
 556 // integer registers.
 557 
 558 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 559 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 560 // units regardless of build. Of course for i486 there is no 64-bit build.
 561 
 562 // The Java calling convention is a "shifted" version of the C ABI.
 563 // By skipping the first C ABI register we can call non-static jni methods
 564 // with small numbers of arguments without having to shuffle the arguments
 565 // at all. Since we control the java ABI we ought to at least get some
 566 // advantage out of it.
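     // For example, for a signature of (Object receiver, int, long, float, double)
     // this assigns: receiver -> j_rarg0, int -> j_rarg1, long -> j_rarg2 (its
     // trailing T_VOID half gets set_bad()), float -> j_farg0, double -> j_farg1,
     // and no stack slots are needed, so the method returns 0.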
 567 
 568 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 569                                            VMRegPair *regs,
 570                                            int total_args_passed) {
 571 
 572   // Create the mapping between argument positions and
 573   // registers.
 574   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 575     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 576   };
 577   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 578     j_farg0, j_farg1, j_farg2, j_farg3,
 579     j_farg4, j_farg5, j_farg6, j_farg7
 580   };
 581 
 582 
 583   uint int_args = 0;
 584   uint fp_args = 0;
 585   uint stk_args = 0;
 586 
 587   for (int i = 0; i < total_args_passed; i++) {
 588     switch (sig_bt[i]) {
 589     case T_BOOLEAN:
 590     case T_CHAR:
 591     case T_BYTE:
 592     case T_SHORT:
 593     case T_INT:
 594       if (int_args < Argument::n_int_register_parameters_j) {
 595         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 596       } else {
 597         stk_args = align_up(stk_args, 2);
 598         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 599         stk_args += 1;
 600       }
 601       break;
 602     case T_VOID:
 603       // halves of T_LONG or T_DOUBLE
 604       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 605       regs[i].set_bad();
 606       break;
 607     case T_LONG:
 608       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 609       // fall through
 610     case T_OBJECT:
 611     case T_ARRAY:
 612     case T_ADDRESS:
 613       if (int_args < Argument::n_int_register_parameters_j) {
 614         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 615       } else {
 616         stk_args = align_up(stk_args, 2);
 617         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 618         stk_args += 2;
 619       }
 620       break;
 621     case T_FLOAT:
 622       if (fp_args < Argument::n_float_register_parameters_j) {
 623         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 624       } else {
 625         stk_args = align_up(stk_args, 2);
 626         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 627         stk_args += 1;
 628       }
 629       break;
 630     case T_DOUBLE:
 631       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 632       if (fp_args < Argument::n_float_register_parameters_j) {
 633         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 634       } else {
 635         stk_args = align_up(stk_args, 2);
 636         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 637         stk_args += 2;
 638       }
 639       break;
 640     default:
 641       ShouldNotReachHere();
 642       break;
 643     }
 644   }
 645 
 646   return stk_args;
 647 }
 648 
 649 // Patch the caller's callsite with the entry to compiled code if it exists.
 650 static void patch_callers_callsite(MacroAssembler *masm) {
 651   Label L;
 652   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 653   __ jcc(Assembler::equal, L);
 654 
 655   // Save the current stack pointer
 656   __ mov(r13, rsp);
 657   // Schedule the branch target address early.
 658   // Call into the VM to patch the caller, then jump to compiled callee
 659   // rax isn't live so capture return address while we easily can
 660   __ movptr(rax, Address(rsp, 0));
 661 
 662   // align stack so push_CPU_state doesn't fault
 663   __ andptr(rsp, -(StackAlignmentInBytes));
 664   __ push_CPU_state();
 665   __ vzeroupper();
 666   // VM needs caller's callsite
 667   // VM needs target method
 668   // This needs to be a long call since we will relocate this adapter to
 669   // the codeBuffer and it may not reach
 670 
 671   // Allocate argument register save area
 672   if (frame::arg_reg_save_area_bytes != 0) {
 673     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 674   }
 675   __ mov(c_rarg0, rbx);
 676   __ mov(c_rarg1, rax);
 677   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 678 
 679   // De-allocate argument register save area
 680   if (frame::arg_reg_save_area_bytes != 0) {
 681     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 682   }
 683 
 684   __ vzeroupper();
 685   __ pop_CPU_state();
 686   // restore sp
 687   __ mov(rsp, r13);
 688   __ bind(L);
 689 }
 690 
 691 
 692 static void gen_c2i_adapter(MacroAssembler *masm,
 693                             int total_args_passed,
 694                             int comp_args_on_stack,
 695                             const BasicType *sig_bt,
 696                             const VMRegPair *regs,
 697                             Label& skip_fixup) {
 698   // Before we get into the guts of the C2I adapter, see if we should be here
 699   // at all.  We've come from compiled code and are attempting to jump to the
 700   // interpreter, which means the caller made a static call to get here
 701   // (vcalls always get a compiled target if there is one).  Check for a
 702   // compiled target.  If there is one, we need to patch the caller's call.
 703   patch_callers_callsite(masm);
 704 
 705   __ bind(skip_fixup);
 706 
 707   // Since all args are passed on the stack, total_args_passed *
 708   // Interpreter::stackElementSize is the space we need.
 709 
 710   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 711 
 712   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 713 
 714   // stack is aligned, keep it that way
 715   // This is not currently needed or enforced by the interpreter, but
 716   // we might as well conform to the ABI.
 717   extraspace = align_up(extraspace, 2*wordSize);
 718 
 719   // set senderSP value
 720   __ lea(r13, Address(rsp, wordSize));
 721 
 722 #ifdef ASSERT
 723   __ check_stack_alignment(r13, "sender stack not aligned");
 724 #endif
 725   if (extraspace > 0) {
 726     // Pop the return address
 727     __ pop(rax);
 728 
 729     __ subptr(rsp, extraspace);
 730 
 731     // Push the return address
 732     __ push(rax);
 733 
 734     // Account for the return address location since we store it first rather
 735     // than hold it in a register across all the shuffling
 736     extraspace += wordSize;
 737   }
 738 
 739 #ifdef ASSERT
 740   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 741 #endif
 742 
 743   // Now write the args into the outgoing interpreter space
 744   for (int i = 0; i < total_args_passed; i++) {
 745     if (sig_bt[i] == T_VOID) {
 746       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 747       continue;
 748     }
 749 
 750     // offset to start parameters
 751     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 752     int next_off = st_off - Interpreter::stackElementSize;
 753 
 754     // Say 4 args:
 755     // i   st_off
 756     // 0   32 T_LONG
 757     // 1   24 T_VOID
 758     // 2   16 T_OBJECT
 759     // 3    8 T_BOOL
 760     // -    0 return address
 761     //
 762     // However, to make things extra confusing: because we can fit a long/double in
 763     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 764     // leaves one slot empty and only stores to a single slot. In this case the
 765     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
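         // With the layout above, the T_LONG value for argument 0 therefore ends up
         // at offset 24 (its next_off, i.e. the T_VOID slot) while offset 32 is left
         // unused (debug builds fill it with junk).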
 766 
 767     VMReg r_1 = regs[i].first();
 768     VMReg r_2 = regs[i].second();
 769     if (!r_1->is_valid()) {
 770       assert(!r_2->is_valid(), "");
 771       continue;
 772     }
 773     if (r_1->is_stack()) {
 774       // memory to memory use rax
 775       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 776       if (!r_2->is_valid()) {
 777         // sign extend??
 778         __ movl(rax, Address(rsp, ld_off));
 779         __ movptr(Address(rsp, st_off), rax);
 780 
 781       } else {
 782 
 783         __ movq(rax, Address(rsp, ld_off));
 784 
 785         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 786         // T_DOUBLE and T_LONG use two slots in the interpreter
 787         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 788           // ld_off == LSW, ld_off+wordSize == MSW
 789           // st_off == MSW, next_off == LSW
 790           __ movq(Address(rsp, next_off), rax);
 791 #ifdef ASSERT
 792           // Overwrite the unused slot with known junk
 793           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 794           __ movptr(Address(rsp, st_off), rax);
 795 #endif /* ASSERT */
 796         } else {
 797           __ movq(Address(rsp, st_off), rax);
 798         }
 799       }
 800     } else if (r_1->is_Register()) {
 801       Register r = r_1->as_Register();
 802       if (!r_2->is_valid()) {
 803         // must be only an int (or less) so move only 32 bits to the slot
 804         // why not sign extend??
 805         __ movl(Address(rsp, st_off), r);
 806       } else {
 807         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 808         // T_DOUBLE and T_LONG use two slots in the interpreter
 809         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 810           // long/double in gpr
 811 #ifdef ASSERT
 812           // Overwrite the unused slot with known junk
 813           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 814           __ movptr(Address(rsp, st_off), rax);
 815 #endif /* ASSERT */
 816           __ movq(Address(rsp, next_off), r);
 817         } else {
 818           __ movptr(Address(rsp, st_off), r);
 819         }
 820       }
 821     } else {
 822       assert(r_1->is_XMMRegister(), "");
 823       if (!r_2->is_valid()) {
 824         // only a float, use just part of the slot
 825         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 826       } else {
 827 #ifdef ASSERT
 828         // Overwrite the unused slot with known junk
 829         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 830         __ movptr(Address(rsp, st_off), rax);
 831 #endif /* ASSERT */
 832         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 833       }
 834     }
 835   }
 836 
 837   // Schedule the branch target address early.
 838   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 839   __ jmp(rcx);
 840 }
 841 
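     // Emit a range check on pc_reg: jump to L_ok if code_start < pc_reg < code_end,
     // otherwise fall through so the caller can emit its failure handling (the stop()
     // in the VerifyAdapterCalls block of gen_i2c_adapter below).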
 842 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 843                         address code_start, address code_end,
 844                         Label& L_ok) {
 845   Label L_fail;
 846   __ lea(temp_reg, ExternalAddress(code_start));
 847   __ cmpptr(pc_reg, temp_reg);
 848   __ jcc(Assembler::belowEqual, L_fail);
 849   __ lea(temp_reg, ExternalAddress(code_end));
 850   __ cmpptr(pc_reg, temp_reg);
 851   __ jcc(Assembler::below, L_ok);
 852   __ bind(L_fail);
 853 }
 854 
 855 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 856                                     int total_args_passed,
 857                                     int comp_args_on_stack,
 858                                     const BasicType *sig_bt,
 859                                     const VMRegPair *regs) {
 860 
 861   // Note: r13 contains the senderSP on entry. We must preserve it since
 862   // we may do an i2c -> c2i transition if we lose a race where compiled
 863   // code goes non-entrant while we get args ready.
 864   // In addition we use r13 to locate all the interpreter args, because
 865   // we must align the stack to 16 bytes on an i2c entry; otherwise we
 866   // lose the alignment we expect in all compiled code, and the register
 867   // save code can segv when fxsave instructions find an improperly
 868   // aligned stack pointer.
 869 
 870   // Adapters can be frameless because they do not require the caller
 871   // to perform additional cleanup work, such as correcting the stack pointer.
 872   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 873   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 874   // even if a callee has modified the stack pointer.
 875   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 876   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 877   // up via the senderSP register).
 878   // In other words, if *either* the caller or callee is interpreted, we can
 879   // get the stack pointer repaired after a call.
 880   // This is why c2i and i2c adapters cannot be indefinitely composed.
 881   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 882   // both caller and callee would be compiled methods, and neither would
 883   // clean up the stack pointer changes performed by the two adapters.
 884   // If this happens, control eventually transfers back to the compiled
 885   // caller, but with an uncorrected stack, causing delayed havoc.
 886 
 887   if (VerifyAdapterCalls &&
 888       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 889     // So, let's test for cascading c2i/i2c adapters right now.
 890     //  assert(Interpreter::contains($return_addr) ||
 891     //         StubRoutines::contains($return_addr),
 892     //         "i2c adapter must return to an interpreter frame");
 893     __ block_comment("verify_i2c { ");
 894     // Pick up the return address
 895     __ movptr(rax, Address(rsp, 0));
 896     Label L_ok;
 897     if (Interpreter::code() != nullptr) {
 898       range_check(masm, rax, r11,
 899                   Interpreter::code()->code_start(),
 900                   Interpreter::code()->code_end(),
 901                   L_ok);
 902     }
 903     if (StubRoutines::initial_stubs_code() != nullptr) {
 904       range_check(masm, rax, r11,
 905                   StubRoutines::initial_stubs_code()->code_begin(),
 906                   StubRoutines::initial_stubs_code()->code_end(),
 907                   L_ok);
 908     }
 909     if (StubRoutines::final_stubs_code() != nullptr) {
 910       range_check(masm, rax, r11,
 911                   StubRoutines::final_stubs_code()->code_begin(),
 912                   StubRoutines::final_stubs_code()->code_end(),
 913                   L_ok);
 914     }
 915     const char* msg = "i2c adapter must return to an interpreter frame";
 916     __ block_comment(msg);
 917     __ stop(msg);
 918     __ bind(L_ok);
 919     __ block_comment("} verify_i2c ");
 920   }
 921 
 922   // Must preserve original SP for loading incoming arguments because
 923   // we need to align the outgoing SP for compiled code.
 924   __ movptr(r11, rsp);
 925 
 926   // Pick up the return address
 927   __ pop(rax);
 928 
 929   // Convert 4-byte c2 stack slots to words.
 930   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
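       // (e.g. 5 compiled arg slots = 20 bytes rounds up to 3 words)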
 931 
 932   if (comp_args_on_stack) {
 933     __ subptr(rsp, comp_words_on_stack * wordSize);
 934   }
 935 
 936   // Ensure compiled code always sees stack at proper alignment
 937   __ andptr(rsp, -16);
 938 
 939   // Push the return address, re-creating the misalignment that the youngest
 940   // frame always sees right after a call instruction
 941   __ push(rax);
 942 
 943   // Put saved SP in another register
 944   const Register saved_sp = rax;
 945   __ movptr(saved_sp, r11);
 946 
 947   // Will jump to the compiled code just as if compiled code was doing it.
 948   // Pre-load the register-jump target early, to schedule it better.
 949   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 950 
 951 #if INCLUDE_JVMCI
 952   if (EnableJVMCI) {
 953     // check if this call should be routed towards a specific entry point
 954     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 955     Label no_alternative_target;
 956     __ jcc(Assembler::equal, no_alternative_target);
 957     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 958     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 959     __ bind(no_alternative_target);
 960   }
 961 #endif // INCLUDE_JVMCI
 962 
 963   // Now generate the shuffle code.  Pick up all register args and move the
 964   // rest through the floating point stack top.
 965   for (int i = 0; i < total_args_passed; i++) {
 966     if (sig_bt[i] == T_VOID) {
 967       // Longs and doubles are passed in native word order, but misaligned
 968       // in the 32-bit build.
 969       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 970       continue;
 971     }
 972 
 973     // Pick up 0, 1 or 2 words from SP+offset.
 974 
 975     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 976             "scrambled load targets?");
 977     // Load in argument order going down.
 978     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 979     // Point to interpreter value (vs. tag)
 980     int next_off = ld_off - Interpreter::stackElementSize;
 981     //
 982     //
 983     //
 984     VMReg r_1 = regs[i].first();
 985     VMReg r_2 = regs[i].second();
 986     if (!r_1->is_valid()) {
 987       assert(!r_2->is_valid(), "");
 988       continue;
 989     }
 990     if (r_1->is_stack()) {
 991       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 992       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 993 
 994       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 995       // and if we end up going thru a c2i because of a miss a reasonable value of r13
 996       // will be generated.
 997       if (!r_2->is_valid()) {
 998         // sign extend???
 999         __ movl(r13, Address(saved_sp, ld_off));
1000         __ movptr(Address(rsp, st_off), r13);
1001       } else {
1002         //
1003         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
1004         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
1005         // So we must adjust where to pick up the data to match the interpreter.
1006         //
1007         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
1008         // are accessed with negative offsets, so the LSW is at the LOW address
1009 
1010         // ld_off is MSW so get LSW
1011         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
1012                            next_off : ld_off;
1013         __ movq(r13, Address(saved_sp, offset));
1014         // st_off is LSW (i.e. reg.first())
1015         __ movq(Address(rsp, st_off), r13);
1016       }
1017     } else if (r_1->is_Register()) {  // Register argument
1018       Register r = r_1->as_Register();
1019       assert(r != rax, "must be different");
1020       if (r_2->is_valid()) {
1021         //
1022         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
1023         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
1024         // So we must adjust where to pick up the data to match the interpreter.
1025 
1026         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
1027                            next_off : ld_off;
1028 
1029         // this can be a misaligned move
1030         __ movq(r, Address(saved_sp, offset));
1031       } else {
1032         // sign extend and use a full word?
1033         __ movl(r, Address(saved_sp, ld_off));
1034       }
1035     } else {
1036       if (!r_2->is_valid()) {
1037         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1038       } else {
1039         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1040       }
1041     }
1042   }
1043 
1044   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1045 
1046   // 6243940 We might end up in handle_wrong_method if
1047   // the callee is deoptimized as we race thru here. If that
1048   // happens we don't want to take a safepoint because the
1049   // caller frame will look interpreted and arguments are now
1050   // "compiled" so it is much better to make this transition
1051   // invisible to the stack walking code. Unfortunately if
1052   // we try and find the callee by normal means a safepoint
1053   // is possible. So we stash the desired callee in the thread
1054   // and the VM will find it there should this case occur.
1055 
1056   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1057 
1058   // Put the Method* where a c2i would expect it should we end up there;
1059   // only needed because c2's resolve stubs return the Method* as a result in
1060   // rax
1061   __ mov(rax, rbx);
1062   __ jmp(r11);
1063 }
1064 
1065 // ---------------------------------------------------------------
1066 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1067                                                             int total_args_passed,
1068                                                             int comp_args_on_stack,
1069                                                             const BasicType *sig_bt,
1070                                                             const VMRegPair *regs,
1071                                                             AdapterFingerPrint* fingerprint) {
1072   address i2c_entry = __ pc();
1073 
1074   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1075 
1076   // -------------------------------------------------------------------------
1077   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1078   // to the interpreter.  The args start out packed in the compiled layout.  They
1079   // need to be unpacked into the interpreter layout.  This will almost always
1080   // require some stack space.  We grow the current (compiled) stack, then repack
1081   // the args.  We  finally end in a jump to the generic interpreter entry point.
1082   // On exit from the interpreter, the interpreter will restore our SP (lest the
1083   // compiled code, which relies solely on SP and not RBP, get sick).
1084 
1085   address c2i_unverified_entry = __ pc();
1086   Label skip_fixup;
1087 
1088   Register data = rax;
1089   Register receiver = j_rarg0;
1090   Register temp = rbx;
1091 
1092   {
1093     __ ic_check(1 /* end_alignment */);
1094     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1095     // The method might have been compiled since the call site was patched to
1096     // interpreted; if that is the case treat it as a miss so we can get
1097     // the call site corrected.
1098     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1099     __ jcc(Assembler::equal, skip_fixup);
1100     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1101   }
1102 
1103   address c2i_entry = __ pc();
1104 
1105   // Class initialization barrier for static methods
1106   address c2i_no_clinit_check_entry = nullptr;
1107   if (VM_Version::supports_fast_class_init_checks()) {
1108     Label L_skip_barrier;
1109     Register method = rbx;
1110 
1111     { // Bypass the barrier for non-static methods
1112       Register flags = rscratch1;
1113       __ movl(flags, Address(method, Method::access_flags_offset()));
1114       __ testl(flags, JVM_ACC_STATIC);
1115       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1116     }
1117 
1118     Register klass = rscratch1;
1119     __ load_method_holder(klass, method);
1120     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1121 
1122     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1123 
1124     __ bind(L_skip_barrier);
1125     c2i_no_clinit_check_entry = __ pc();
1126   }
1127 
1128   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1129   bs->c2i_entry_barrier(masm);
1130 
1131   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1132 
1133   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1134 }
1135 
1136 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1137                                          VMRegPair *regs,
1138                                          int total_args_passed) {
1139 
1140 // We return the number of VMRegImpl stack slots we need to reserve for all
1141 // the arguments NOT counting out_preserve_stack_slots.
1142 
1143 // NOTE: These arrays will have to change when c1 is ported
1144 #ifdef _WIN64
1145     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1146       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1147     };
1148     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1149       c_farg0, c_farg1, c_farg2, c_farg3
1150     };
1151 #else
1152     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1153       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1154     };
1155     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1156       c_farg0, c_farg1, c_farg2, c_farg3,
1157       c_farg4, c_farg5, c_farg6, c_farg7
1158     };
1159 #endif // _WIN64
1160 
1161 
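         // Note the Win64 quirk modeled below: allocating an integer register also
         // consumes the corresponding FP register slot (and vice versa), plus home
         // space on the stack. For example, for (int, double, Object) Win64 yields
         // c_rarg0, c_farg1, c_rarg2, while the SysV path yields c_rarg0, c_farg0,
         // c_rarg1.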
1162     uint int_args = 0;
1163     uint fp_args = 0;
1164     uint stk_args = 0; // inc by 2 each time
1165 
1166     for (int i = 0; i < total_args_passed; i++) {
1167       switch (sig_bt[i]) {
1168       case T_BOOLEAN:
1169       case T_CHAR:
1170       case T_BYTE:
1171       case T_SHORT:
1172       case T_INT:
1173         if (int_args < Argument::n_int_register_parameters_c) {
1174           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1175 #ifdef _WIN64
1176           fp_args++;
1177           // Allocate slots for the callee to stuff register args on the stack.
1178           stk_args += 2;
1179 #endif
1180         } else {
1181           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1182           stk_args += 2;
1183         }
1184         break;
1185       case T_LONG:
1186         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1187         // fall through
1188       case T_OBJECT:
1189       case T_ARRAY:
1190       case T_ADDRESS:
1191       case T_METADATA:
1192         if (int_args < Argument::n_int_register_parameters_c) {
1193           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1194 #ifdef _WIN64
1195           fp_args++;
1196           stk_args += 2;
1197 #endif
1198         } else {
1199           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1200           stk_args += 2;
1201         }
1202         break;
1203       case T_FLOAT:
1204         if (fp_args < Argument::n_float_register_parameters_c) {
1205           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1206 #ifdef _WIN64
1207           int_args++;
1208           // Allocate slots for the callee to stuff register args on the stack.
1209           stk_args += 2;
1210 #endif
1211         } else {
1212           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1213           stk_args += 2;
1214         }
1215         break;
1216       case T_DOUBLE:
1217         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1218         if (fp_args < Argument::n_float_register_parameters_c) {
1219           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1220 #ifdef _WIN64
1221           int_args++;
1222           // Allocate slots for the callee to stuff register args on the stack.
1223           stk_args += 2;
1224 #endif
1225         } else {
1226           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1227           stk_args += 2;
1228         }
1229         break;
1230       case T_VOID: // Halves of longs and doubles
1231         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1232         regs[i].set_bad();
1233         break;
1234       default:
1235         ShouldNotReachHere();
1236         break;
1237       }
1238     }
1239 #ifdef _WIN64
1240   // The Windows ABI requires that we always allocate enough stack space
1241   // for 4 64-bit registers to be stored down.
1242   if (stk_args < 8) {
1243     stk_args = 8;
1244   }
1245 #endif // _WIN64
1246 
1247   return stk_args;
1248 }
1249 
1250 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1251                                              uint num_bits,
1252                                              uint total_args_passed) {
1253   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1254          "only certain vector sizes are supported for now");
1255 
1256   static const XMMRegister VEC_ArgReg[32] = {
1257      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1258      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1259     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1260     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1261   };
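       // Each argument lands in its own vector register; e.g. with num_bits == 256,
       // argument i is passed in xmm<i> as a VMReg pair spanning eight 32-bit slots
       // (vmreg .. vmreg->next(7)). No stack slots are used, so we return 0.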
1262 
1263   uint stk_args = 0;
1264   uint fp_args = 0;
1265 
1266   for (uint i = 0; i < total_args_passed; i++) {
1267     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1268     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1269     regs[i].set_pair(vmreg->next(next_val), vmreg);
1270   }
1271 
1272   return stk_args;
1273 }
1274 
1275 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1276   // We always ignore the frame_slots arg and just use the space just below the
1277   // frame pointer, which by this time is free to use
1278   switch (ret_type) {
1279   case T_FLOAT:
1280     __ movflt(Address(rbp, -wordSize), xmm0);
1281     break;
1282   case T_DOUBLE:
1283     __ movdbl(Address(rbp, -wordSize), xmm0);
1284     break;
1285   case T_VOID:  break;
1286   default: {
1287     __ movptr(Address(rbp, -wordSize), rax);
1288     }
1289   }
1290 }
1291 
1292 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1293   // We always ignore the frame_slots arg and just use the space just below the
1294   // frame pointer, which by this time is free to use
1295   switch (ret_type) {
1296   case T_FLOAT:
1297     __ movflt(xmm0, Address(rbp, -wordSize));
1298     break;
1299   case T_DOUBLE:
1300     __ movdbl(xmm0, Address(rbp, -wordSize));
1301     break;
1302   case T_VOID:  break;
1303   default: {
1304     __ movptr(rax, Address(rbp, -wordSize));
1305     }
1306   }
1307 }
1308 
1309 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1310     for ( int i = first_arg ; i < arg_count ; i++ ) {
1311       if (args[i].first()->is_Register()) {
1312         __ push(args[i].first()->as_Register());
1313       } else if (args[i].first()->is_XMMRegister()) {
1314         __ subptr(rsp, 2*wordSize);
1315         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1316       }
1317     }
1318 }
1319 
1320 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1321     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1322       if (args[i].first()->is_Register()) {
1323         __ pop(args[i].first()->as_Register());
1324       } else if (args[i].first()->is_XMMRegister()) {
1325         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1326         __ addptr(rsp, 2*wordSize);
1327       }
1328     }
1329 }
1330 
1331 static void verify_oop_args(MacroAssembler* masm,
1332                             const methodHandle& method,
1333                             const BasicType* sig_bt,
1334                             const VMRegPair* regs) {
1335   Register temp_reg = rbx;  // not part of any compiled calling seq
1336   if (VerifyOops) {
1337     for (int i = 0; i < method->size_of_parameters(); i++) {
1338       if (is_reference_type(sig_bt[i])) {
1339         VMReg r = regs[i].first();
1340         assert(r->is_valid(), "bad oop arg");
1341         if (r->is_stack()) {
1342           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1343           __ verify_oop(temp_reg);
1344         } else {
1345           __ verify_oop(r->as_Register());
1346         }
1347       }
1348     }
1349   }
1350 }
1351 
1352 static void check_continuation_enter_argument(VMReg actual_vmreg,
1353                                               Register expected_reg,
1354                                               const char* name) {
1355   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1356   assert(actual_vmreg->as_Register() == expected_reg,
1357          "%s is in unexpected register: %s instead of %s",
1358          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1359 }
1360 
1361 
1362 //---------------------------- continuation_enter_setup ---------------------------
1363 //
1364 // Arguments:
1365 //   None.
1366 //
1367 // Results:
1368 //   rsp: pointer to blank ContinuationEntry
1369 //
1370 // Kills:
1371 //   rax
1372 //
1373 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1374   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1375   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1376   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1377 
1378   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1379   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1380 
1381   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1382   OopMap* map = new OopMap(frame_size, 0);
1383 
1384   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1385   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1386   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
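  // Link the new entry into the thread's chain: the previous cont_entry becomes this
  // entry's parent, and rsp (the new entry) becomes the thread's current cont_entry.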
1387 
1388   return map;
1389 }
1390 
1391 //---------------------------- fill_continuation_entry ---------------------------
1392 //
1393 // Arguments:
//   rsp: pointer to blank ContinuationEntry
1395 //   reg_cont_obj: pointer to the continuation
1396 //   reg_flags: flags
1397 //
1398 // Results:
1399 //   rsp: pointer to filled out ContinuationEntry
1400 //
1401 // Kills:
1402 //   rax
1403 //
1404 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1405   assert_different_registers(rax, reg_cont_obj, reg_flags);
1406 #ifdef ASSERT
1407   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1408 #endif
1409   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1410   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1411   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1412   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1413   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1414 
1415   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1416   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1417   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1418   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1419 
1420   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1421   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1422 }
1423 
1424 //---------------------------- continuation_enter_cleanup ---------------------------
1425 //
1426 // Arguments:
1427 //   rsp: pointer to the ContinuationEntry
1428 //
1429 // Results:
1430 //   rsp: pointer to the spilled rbp in the entry frame
1431 //
1432 // Kills:
1433 //   rbx
1434 //
1435 static void continuation_enter_cleanup(MacroAssembler* masm) {
1436 #ifdef ASSERT
1437   Label L_good_sp;
1438   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1439   __ jcc(Assembler::equal, L_good_sp);
1440   __ stop("Incorrect rsp at continuation_enter_cleanup");
1441   __ bind(L_good_sp);
1442 #endif
1443   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1444   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1445 
1446   if (CheckJNICalls) {
1447     // Check if this is a virtual thread continuation
1448     Label L_skip_vthread_code;
1449     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1450     __ jcc(Assembler::equal, L_skip_vthread_code);
1451 
1452     // If the held monitor count is > 0 and this vthread is terminating then
1453     // it failed to release a JNI monitor. So we issue the same log message
1454     // that JavaThread::exit does.
1455     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1456     __ jcc(Assembler::equal, L_skip_vthread_code);
1457 
1458     // rax may hold an exception oop, save it before the call
1459     __ push(rax);
1460     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1461     __ pop(rax);
1462 
1463     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1464     // on termination. The held count is implicitly zeroed below when we restore from
1465     // the parent held count (which has to be zero).
1466     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1467 
1468     __ bind(L_skip_vthread_code);
1469   }
1470 #ifdef ASSERT
1471   else {
1472     // Check if this is a virtual thread continuation
1473     Label L_skip_vthread_code;
1474     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1475     __ jcc(Assembler::equal, L_skip_vthread_code);
1476 
1477     // See comment just above. If not checking JNI calls the JNI count is only
1478     // needed for assertion checking.
1479     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1480 
1481     __ bind(L_skip_vthread_code);
1482   }
1483 #endif
1484 
1485   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1486   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1487 
1488   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1489   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1490   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1491 }
1492 
1493 static void gen_continuation_enter(MacroAssembler* masm,
1494                                    const VMRegPair* regs,
1495                                    int& exception_offset,
1496                                    OopMapSet* oop_maps,
1497                                    int& frame_complete,
1498                                    int& stack_slots,
1499                                    int& interpreted_entry_offset,
1500                                    int& compiled_entry_offset) {
1501 
1502   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1503   int pos_cont_obj   = 0;
1504   int pos_is_cont    = 1;
1505   int pos_is_virtual = 2;
1506 
1507   // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside in these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
1511   Register reg_cont_obj   = c_rarg1;
1512   Register reg_is_cont    = c_rarg2;
1513   Register reg_is_virtual = c_rarg3;
1514 
1515   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1516   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1517   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1518 
1519   // Utility methods kill rax, make sure there are no collisions
1520   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1521 
1522   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1523                          relocInfo::static_call_type);
1524 
1525   address start = __ pc();
1526 
1527   Label L_thaw, L_exit;
1528 
1529   // i2i entry used at interp_only_mode only
1530   interpreted_entry_offset = __ pc() - start;
1531   {
1532 #ifdef ASSERT
1533     Label is_interp_only;
1534     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1535     __ jcc(Assembler::notEqual, is_interp_only);
1536     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1537     __ bind(is_interp_only);
1538 #endif
1539 
1540     __ pop(rax); // return address
1541     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1542     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1543     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1544     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
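    // The interpreter passes its arguments on a downward-growing stack, so after popping
    // the return address the last argument (isVirtualThread) sits at offset 0 and the
    // Continuation object at offset 2 * stackElementSize, matching the loads above.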
1545     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1546     __ push(rax); // return address
1547     __ push_cont_fastpath();
1548 
1549     __ enter();
1550 
1551     stack_slots = 2; // will be adjusted in setup
1552     OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame
    // would appear unsafe. That's okay: at the very worst we'll miss an async sample, but we're
    // in interp_only_mode anyway.
1555 
1556     __ verify_oop(reg_cont_obj);
1557 
1558     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1559 
1560     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1561     __ testptr(reg_is_cont, reg_is_cont);
1562     __ jcc(Assembler::notZero, L_thaw);
1563 
1564     // --- Resolve path
1565 
1566     // Make sure the call is patchable
1567     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1568     // Emit stub for static call
1569     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1570     if (stub == nullptr) {
1571       fatal("CodeCache is full at gen_continuation_enter");
1572     }
1573     __ call(resolve);
1574     oop_maps->add_gc_map(__ pc() - start, map);
1575     __ post_call_nop();
1576 
1577     __ jmp(L_exit);
1578   }
1579 
1580   // compiled entry
1581   __ align(CodeEntryAlignment);
1582   compiled_entry_offset = __ pc() - start;
1583   __ enter();
1584 
1585   stack_slots = 2; // will be adjusted in setup
1586   OopMap* map = continuation_enter_setup(masm, stack_slots);
1587 
1588   // Frame is now completed as far as size and linkage.
1589   frame_complete = __ pc() - start;
1590 
1591   __ verify_oop(reg_cont_obj);
1592 
1593   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1594 
1595   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1596   __ testptr(reg_is_cont, reg_is_cont);
1597   __ jccb(Assembler::notZero, L_thaw);
1598 
1599   // --- call Continuation.enter(Continuation c, boolean isContinue)
1600 
1601   // Make sure the call is patchable
1602   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
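  // (The alignment keeps the call's 4-byte displacement within one machine word so it can
  // be patched atomically once the call is resolved.)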
1603 
1604   // Emit stub for static call
1605   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1606   if (stub == nullptr) {
1607     fatal("CodeCache is full at gen_continuation_enter");
1608   }
1609 
1610   // The call needs to be resolved. There's a special case for this in
1611   // SharedRuntime::find_callee_info_helper() which calls
1612   // LinkResolver::resolve_continuation_enter() which resolves the call to
1613   // Continuation.enter(Continuation c, boolean isContinue).
1614   __ call(resolve);
1615 
1616   oop_maps->add_gc_map(__ pc() - start, map);
1617   __ post_call_nop();
1618 
1619   __ jmpb(L_exit);
1620 
1621   // --- Thawing path
1622 
1623   __ bind(L_thaw);
1624 
1625   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1626   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1627 
1628   ContinuationEntry::_return_pc_offset = __ pc() - start;
1629   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1630   __ post_call_nop();
1631 
1632   // --- Normal exit (resolve/thawing)
1633 
1634   __ bind(L_exit);
1635 
1636   continuation_enter_cleanup(masm);
1637   __ pop(rbp);
1638   __ ret(0);
1639 
1640   // --- Exception handling path
1641 
1642   exception_offset = __ pc() - start;
1643 
1644   continuation_enter_cleanup(masm);
1645   __ pop(rbp);
1646 
1647   __ movptr(c_rarg0, r15_thread);
1648   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1649 
1650   // rax still holds the original exception oop, save it before the call
1651   __ push(rax);
1652 
1653   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1654   __ movptr(rbx, rax);
1655 
1656   // Continue at exception handler:
1657   //   rax: exception oop
1658   //   rbx: exception handler
1659   //   rdx: exception pc
1660   __ pop(rax);
1661   __ verify_oop(rax);
1662   __ pop(rdx);
1663   __ jmp(rbx);
1664 }
1665 
1666 static void gen_continuation_yield(MacroAssembler* masm,
1667                                    const VMRegPair* regs,
1668                                    OopMapSet* oop_maps,
1669                                    int& frame_complete,
1670                                    int& stack_slots,
1671                                    int& compiled_entry_offset) {
1672   enum layout {
1673     rbp_off,
1674     rbpH_off,
1675     return_off,
1676     return_off2,
1677     framesize // inclusive of return address
1678   };
  stack_slots = framesize / VMRegImpl::slots_per_word;
1680   assert(stack_slots == 2, "recheck layout");
1681 
1682   address start = __ pc();
1683   compiled_entry_offset = __ pc() - start;
1684   __ enter();
1685   address the_pc = __ pc();
1686 
1687   frame_complete = the_pc - start;
1688 
1689   // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, so associate the OopMap
  // with it right away.
1692   __ post_call_nop();
1693   OopMap* map = new OopMap(framesize, 1);
1694   oop_maps->add_gc_map(frame_complete, map);
1695 
1696   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1697   __ movptr(c_rarg0, r15_thread);
1698   __ movptr(c_rarg1, rsp);
1699   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1700   __ reset_last_Java_frame(true);
1701 
1702   Label L_pinned;
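  // freeze_entry() reports success with a zero result; a non-zero result means the freeze
  // failed (e.g. the continuation is pinned), in which case we return to the caller below.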
1703 
1704   __ testptr(rax, rax);
1705   __ jcc(Assembler::notZero, L_pinned);
1706 
1707   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1708   continuation_enter_cleanup(masm);
1709   __ pop(rbp);
1710   __ ret(0);
1711 
1712   __ bind(L_pinned);
1713 
1714   // Pinned, return to caller
1715 
1716   // handle pending exception thrown by freeze
1717   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1718   Label ok;
1719   __ jcc(Assembler::equal, ok);
1720   __ leave();
1721   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1722   __ bind(ok);
1723 
1724   __ leave();
1725   __ ret(0);
1726 }
1727 
1728 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1729   ::continuation_enter_cleanup(masm);
1730 }
1731 
1732 static void gen_special_dispatch(MacroAssembler* masm,
1733                                  const methodHandle& method,
1734                                  const BasicType* sig_bt,
1735                                  const VMRegPair* regs) {
1736   verify_oop_args(masm, method, sig_bt, regs);
1737   vmIntrinsics::ID iid = method->intrinsic_id();
1738 
1739   // Now write the args into the outgoing interpreter space
1740   bool     has_receiver   = false;
1741   Register receiver_reg   = noreg;
1742   int      member_arg_pos = -1;
1743   Register member_reg     = noreg;
1744   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1745   if (ref_kind != 0) {
1746     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1747     member_reg = rbx;  // known to be free at this point
1748     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1749   } else if (iid == vmIntrinsics::_invokeBasic) {
1750     has_receiver = true;
1751   } else if (iid == vmIntrinsics::_linkToNative) {
1752     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1753     member_reg = rbx;  // known to be free at this point
1754   } else {
1755     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1756   }
1757 
1758   if (member_reg != noreg) {
1759     // Load the member_arg into register, if necessary.
1760     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1761     VMReg r = regs[member_arg_pos].first();
1762     if (r->is_stack()) {
1763       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1764     } else {
1765       // no data motion is needed
1766       member_reg = r->as_Register();
1767     }
1768   }
1769 
1770   if (has_receiver) {
1771     // Make sure the receiver is loaded into a register.
1772     assert(method->size_of_parameters() > 0, "oob");
1773     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1774     VMReg r = regs[0].first();
1775     assert(r->is_valid(), "bad receiver arg");
1776     if (r->is_stack()) {
1777       // Porting note:  This assumes that compiled calling conventions always
1778       // pass the receiver oop in a register.  If this is not true on some
1779       // platform, pick a temp and load the receiver from stack.
1780       fatal("receiver always in a register");
1781       receiver_reg = j_rarg0;  // known to be free at this point
1782       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1783     } else {
1784       // no data motion is needed
1785       receiver_reg = r->as_Register();
1786     }
1787   }
1788 
1789   // Figure out which address we are really jumping to:
1790   MethodHandles::generate_method_handle_dispatch(masm, iid,
1791                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1792 }
1793 
1794 // ---------------------------------------------------------------------------
1795 // Generate a native wrapper for a given method.  The method takes arguments
1796 // in the Java compiled code convention, marshals them to the native
1797 // convention (handlizes oops, etc), transitions to native, makes the call,
1798 // returns to java state (possibly blocking), unhandlizes any result and
1799 // returns.
1800 //
1801 // Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
1803 // functions.  The wrapper is expected to unpack the arguments before
1804 // passing them to the callee. Critical native functions leave the state _in_Java,
1805 // since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1809 //
1810 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1811                                                 const methodHandle& method,
1812                                                 int compile_id,
1813                                                 BasicType* in_sig_bt,
1814                                                 VMRegPair* in_regs,
1815                                                 BasicType ret_type) {
1816   if (method->is_continuation_native_intrinsic()) {
1817     int exception_offset = -1;
1818     OopMapSet* oop_maps = new OopMapSet();
1819     int frame_complete = -1;
1820     int stack_slots = -1;
1821     int interpreted_entry_offset = -1;
1822     int vep_offset = -1;
1823     if (method->is_continuation_enter_intrinsic()) {
1824       gen_continuation_enter(masm,
1825                              in_regs,
1826                              exception_offset,
1827                              oop_maps,
1828                              frame_complete,
1829                              stack_slots,
1830                              interpreted_entry_offset,
1831                              vep_offset);
1832     } else if (method->is_continuation_yield_intrinsic()) {
1833       gen_continuation_yield(masm,
1834                              in_regs,
1835                              oop_maps,
1836                              frame_complete,
1837                              stack_slots,
1838                              vep_offset);
1839     } else {
1840       guarantee(false, "Unknown Continuation native intrinsic");
1841     }
1842 
1843 #ifdef ASSERT
1844     if (method->is_continuation_enter_intrinsic()) {
1845       assert(interpreted_entry_offset != -1, "Must be set");
1846       assert(exception_offset != -1,         "Must be set");
1847     } else {
1848       assert(interpreted_entry_offset == -1, "Must be unset");
1849       assert(exception_offset == -1,         "Must be unset");
1850     }
1851     assert(frame_complete != -1,    "Must be set");
1852     assert(stack_slots != -1,       "Must be set");
1853     assert(vep_offset != -1,        "Must be set");
1854 #endif
1855 
1856     __ flush();
1857     nmethod* nm = nmethod::new_native_nmethod(method,
1858                                               compile_id,
1859                                               masm->code(),
1860                                               vep_offset,
1861                                               frame_complete,
1862                                               stack_slots,
1863                                               in_ByteSize(-1),
1864                                               in_ByteSize(-1),
1865                                               oop_maps,
1866                                               exception_offset);
1867     if (nm == nullptr) return nm;
1868     if (method->is_continuation_enter_intrinsic()) {
1869       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1870     } else if (method->is_continuation_yield_intrinsic()) {
1871       _cont_doYield_stub = nm;
1872     }
1873     return nm;
1874   }
1875 
1876   if (method->is_method_handle_intrinsic()) {
1877     vmIntrinsics::ID iid = method->intrinsic_id();
1878     intptr_t start = (intptr_t)__ pc();
1879     int vep_offset = ((intptr_t)__ pc()) - start;
1880     gen_special_dispatch(masm,
1881                          method,
1882                          in_sig_bt,
1883                          in_regs);
1884     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1885     __ flush();
1886     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1887     return nmethod::new_native_nmethod(method,
1888                                        compile_id,
1889                                        masm->code(),
1890                                        vep_offset,
1891                                        frame_complete,
1892                                        stack_slots / VMRegImpl::slots_per_word,
1893                                        in_ByteSize(-1),
1894                                        in_ByteSize(-1),
1895                                        nullptr);
1896   }
1897   address native_func = method->native_function();
1898   assert(native_func != nullptr, "must have function");
1899 
1900   // An OopMap for lock (and class if static)
1901   OopMapSet *oop_maps = new OopMapSet();
1902   intptr_t start = (intptr_t)__ pc();
1903 
  // We have received a description of where all the java args are located
1905   // on entry to the wrapper. We need to convert these args to where
1906   // the jni function will expect them. To figure out where they go
1907   // we convert the java signature to a C signature by inserting
1908   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1909 
1910   const int total_in_args = method->size_of_parameters();
1911   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1912 
1913   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1914   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1915   BasicType* in_elem_bt = nullptr;
1916 
1917   int argc = 0;
1918   out_sig_bt[argc++] = T_ADDRESS;
1919   if (method->is_static()) {
1920     out_sig_bt[argc++] = T_OBJECT;
1921   }
1922 
1923   for (int i = 0; i < total_in_args ; i++ ) {
1924     out_sig_bt[argc++] = in_sig_bt[i];
1925   }
1926 
1927   // Now figure out where the args must be stored and how much stack space
1928   // they require.
1929   int out_arg_slots;
1930   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1931 
1932   // Compute framesize for the wrapper.  We need to handlize all oops in
1933   // incoming registers
1934 
1935   // Calculate the total number of stack slots we will need.
1936 
1937   // First count the abi requirement plus all of the outgoing args
1938   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1939 
1940   // Now the space for the inbound oop handle area
1941   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1942 
1943   int oop_handle_offset = stack_slots;
1944   stack_slots += total_save_slots;
1945 
1946   // Now any space we need for handlizing a klass if static method
1947 
1948   int klass_slot_offset = 0;
1949   int klass_offset = -1;
1950   int lock_slot_offset = 0;
1951   bool is_static = false;
1952 
1953   if (method->is_static()) {
1954     klass_slot_offset = stack_slots;
1955     stack_slots += VMRegImpl::slots_per_word;
1956     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1957     is_static = true;
1958   }
1959 
1960   // Plus a lock if needed
1961 
1962   if (method->is_synchronized()) {
1963     lock_slot_offset = stack_slots;
1964     stack_slots += VMRegImpl::slots_per_word;
1965   }
1966 
1967   // Now a place (+2) to save return values or temp during shuffling
1968   // + 4 for return address (which we own) and saved rbp
1969   stack_slots += 6;
1970 
1971   // Ok The space we have allocated will look like:
1972   //
1973   //
1974   // FP-> |                     |
1975   //      |---------------------|
1976   //      | 2 slots for moves   |
1977   //      |---------------------|
1978   //      | lock box (if sync)  |
1979   //      |---------------------| <- lock_slot_offset
1980   //      | klass (if static)   |
1981   //      |---------------------| <- klass_slot_offset
1982   //      | oopHandle area      |
1983   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1984   //      | outbound memory     |
1985   //      | based arguments     |
1986   //      |                     |
1987   //      |---------------------|
1988   //      |                     |
1989   // SP-> | out_preserved_slots |
1990   //
1991   //
1992 
1993 
1994   // Now compute actual number of stack words we need rounding to make
1995   // stack properly aligned.
1996   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1997 
1998   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1999 
2000   // First thing make an ic check to see if we should even be here
2001 
2002   // We are free to use all registers as temps without saving them and
2003   // restoring them except rbp. rbp is the only callee save register
2004   // as far as the interpreter and the compiler(s) are concerned.
2005 
2006   const Register receiver = j_rarg0;
2007 
2008   Label exception_pending;
2009 
2010   assert_different_registers(receiver, rscratch1, rscratch2);
2011   __ verify_oop(receiver);
2012   __ ic_check(8 /* end_alignment */);
2013 
2014   int vep_offset = ((intptr_t)__ pc()) - start;
2015 
2016   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2017     Label L_skip_barrier;
2018     Register klass = r10;
2019     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2020     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2021 
2022     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2023 
2024     __ bind(L_skip_barrier);
2025   }
2026 
2027 #ifdef COMPILER1
2028   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2029   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2030     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2031   }
2032 #endif // COMPILER1
2033 
2034   // The instruction at the verified entry point must be 5 bytes or longer
  // because it can be patched on the fly by make_not_entrant. The stack bang
2036   // instruction fits that requirement.
2037 
2038   // Generate stack overflow check
2039   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2040 
2041   // Generate a new frame for the wrapper.
2042   __ enter();
2043   // -2 because return address is already present and so is saved rbp
2044   __ subptr(rsp, stack_size - 2*wordSize);
2045 
2046   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2047   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2048   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2049 
2050   // Frame is now completed as far as size and linkage.
2051   int frame_complete = ((intptr_t)__ pc()) - start;
2052 
2053 #ifdef ASSERT
2054   __ check_stack_alignment(rsp, "improperly aligned stack");
2055 #endif /* ASSERT */
2056 
2057 
2058   // We use r14 as the oop handle for the receiver/klass
2059   // It is callee save so it survives the call to native
2060 
2061   const Register oop_handle_reg = r14;
2062 
2063   //
2064   // We immediately shuffle the arguments so that any vm call we have to
2065   // make from here on out (sync slow path, jvmti, etc.) we will have
2066   // captured the oops from our caller and have a valid oopMap for
2067   // them.
2068 
2069   // -----------------
2070   // The Grand Shuffle
2071 
  // The Java calling convention is either equal to (Linux) or denser than (Win64) the
  // C calling convention. However, because of the jni_env argument the C calling
  // convention always has at least one more argument (two more for static methods) than Java.
  // Therefore, if we move the args from Java -> C backwards we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.
2078   //
2079 
2080   // Record esp-based slot for receiver on stack for non-static methods
2081   int receiver_offset = -1;
2082 
2083   // This is a trick. We double the stack slots so we can claim
2084   // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
2086   // sure we can capture all the incoming oop args from the
2087   // caller.
2088   //
2089   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2090 
2091   // Mark location of rbp (someday)
2092   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2093 
2094   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2095   // All inbound args are referenced based on rbp and all outbound args via rsp.
2096 
2097 
2098 #ifdef ASSERT
2099   bool reg_destroyed[Register::number_of_registers];
2100   bool freg_destroyed[XMMRegister::number_of_registers];
2101   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2102     reg_destroyed[r] = false;
2103   }
2104   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2105     freg_destroyed[f] = false;
2106   }
2107 
2108 #endif /* ASSERT */
2109 
2110   // For JNI natives the incoming and outgoing registers are offset upwards.
2111   GrowableArray<int> arg_order(2 * total_in_args);
2112 
2113   VMRegPair tmp_vmreg;
2114   tmp_vmreg.set2(rbx->as_VMReg());
2115 
2116   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2117     arg_order.push(i);
2118     arg_order.push(c_arg);
2119   }
2120 
2121   int temploc = -1;
2122   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2123     int i = arg_order.at(ai);
2124     int c_arg = arg_order.at(ai + 1);
2125     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2126 #ifdef ASSERT
2127     if (in_regs[i].first()->is_Register()) {
2128       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2129     } else if (in_regs[i].first()->is_XMMRegister()) {
2130       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2131     }
2132     if (out_regs[c_arg].first()->is_Register()) {
2133       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2134     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2135       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2136     }
2137 #endif /* ASSERT */
2138     switch (in_sig_bt[i]) {
2139       case T_ARRAY:
2140       case T_OBJECT:
2141         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2142                     ((i == 0) && (!is_static)),
2143                     &receiver_offset);
2144         break;
2145       case T_VOID:
2146         break;
2147 
2148       case T_FLOAT:
2149         __ float_move(in_regs[i], out_regs[c_arg]);
        break;
2151 
2152       case T_DOUBLE:
2153         assert( i + 1 < total_in_args &&
2154                 in_sig_bt[i + 1] == T_VOID &&
2155                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2156         __ double_move(in_regs[i], out_regs[c_arg]);
2157         break;
2158 
2159       case T_LONG :
2160         __ long_move(in_regs[i], out_regs[c_arg]);
2161         break;
2162 
2163       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2164 
2165       default:
2166         __ move32_64(in_regs[i], out_regs[c_arg]);
2167     }
2168   }
2169 
2170   int c_arg;
2171 
2172   // Pre-load a static method's oop into r14.  Used both by locking code and
2173   // the normal JNI call code.
2174   // point c_arg at the first arg that is already loaded in case we
2175   // need to spill before we call out
2176   c_arg = total_c_args - total_in_args;
2177 
2178   if (method->is_static()) {
2179 
2180     //  load oop into a register
2181     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2182 
    // Now handlize the static class mirror; it's known not-null.
2184     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2185     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2186 
2187     // Now get the handle
2188     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2189     // store the klass handle as second argument
2190     __ movptr(c_rarg1, oop_handle_reg);
2191     // and protect the arg if we must spill
2192     c_arg--;
2193   }
2194 
2195   // Change state to native (we save the return address in the thread, since it might not
2196   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2197   // points into the right code segment. It does not have to be the correct return pc.
2198   // We use the same pc/oopMap repeatedly when we call out
2199 
2200   intptr_t the_pc = (intptr_t) __ pc();
2201   oop_maps->add_gc_map(the_pc - start, map);
2202 
2203   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2204 
2205 
  // We have all of the arguments set up at this point. We must not touch any of the
  // argument registers from here on (if we had to save/restore them, there would be no
  // oopMap entries describing them).
2208 
2209   if (DTraceMethodProbes) {
2210     // protect the args we've loaded
2211     save_args(masm, total_c_args, c_arg, out_regs);
2212     __ mov_metadata(c_rarg1, method());
2213     __ call_VM_leaf(
2214       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2215       r15_thread, c_rarg1);
2216     restore_args(masm, total_c_args, c_arg, out_regs);
2217   }
2218 
2219   // RedefineClasses() tracing support for obsolete method entry
2220   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2221     // protect the args we've loaded
2222     save_args(masm, total_c_args, c_arg, out_regs);
2223     __ mov_metadata(c_rarg1, method());
2224     __ call_VM_leaf(
2225       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2226       r15_thread, c_rarg1);
2227     restore_args(masm, total_c_args, c_arg, out_regs);
2228   }
2229 
2230   // Lock a synchronized method
2231 
2232   // Register definitions used by locking and unlocking
2233 
2234   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2235   const Register obj_reg  = rbx;  // Will contain the oop
2236   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2237   const Register old_hdr  = r13;  // value of old header at unlock time
2238 
2239   Label slow_path_lock;
2240   Label lock_done;
2241 
2242   if (method->is_synchronized()) {
2243     Label count_mon;
2244 
2245     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2246 
2247     // Get the handle (the 2nd argument)
2248     __ mov(oop_handle_reg, c_rarg1);
2249 
2250     // Get address of the box
2251 
2252     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2253 
2254     // Load the oop from the handle
2255     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2256 
2257     if (LockingMode == LM_MONITOR) {
2258       __ jmp(slow_path_lock);
2259     } else if (LockingMode == LM_LEGACY) {
2260       // Load immediate 1 into swap_reg %rax
2261       __ movl(swap_reg, 1);
2262 
2263       // Load (object->mark() | 1) into swap_reg %rax
2264       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2265 
2266       // Save (object->mark() | 1) into BasicLock's displaced header
2267       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2268 
2269       // src -> dest iff dest == rax else rax <- dest
2270       __ lock();
2271       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2272       __ jcc(Assembler::equal, count_mon);
2273 
2274       // Hmm should this move to the slow path code area???
2275 
2276       // Test if the oopMark is an obvious stack pointer, i.e.,
2277       //  1) (mark & 3) == 0, and
      //  2) rsp <= mark < rsp + os::pagesize()
2279       // These 3 tests can be done by evaluating the following
2280       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2281       // assuming both stack pointer and pagesize have their
2282       // least significant 2 bits clear.
2283       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
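      // Worked example: with a 4K page, 3 - 4096 == -4093, a mask whose set bits are
      // bits 0-1 and bits 12 and up; the AND below is therefore zero exactly when
      // 0 <= (mark - rsp) < 4096 and the low two bits of the difference are clear,
      // which is the recursive case recorded below.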
2284 
2285       __ subptr(swap_reg, rsp);
2286       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2287 
2288       // Save the test result, for recursive case, the result is zero
2289       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2290       __ jcc(Assembler::notEqual, slow_path_lock);
2291     } else {
2292       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2293       __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2294     }
2295     __ jmp (lock_done);
2296 
2297     __ bind(count_mon);
2298     __ inc_held_monitor_count();
2299 
2300     // Slow path will re-enter here
2301     __ bind(lock_done);
2302   }
2303 
2304   // Finally just about ready to make the JNI call
2305 
2306   // get JNIEnv* which is first argument to native
2307   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2308 
2309   // Now set thread in native
2310   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2311 
2312   __ call(RuntimeAddress(native_func));
2313 
2314   // Verify or restore cpu control state after JNI call
2315   __ restore_cpu_control_state_after_jni(rscratch1);
2316 
2317   // Unpack native results.
2318   switch (ret_type) {
2319   case T_BOOLEAN: __ c2bool(rax);            break;
2320   case T_CHAR   : __ movzwl(rax, rax);      break;
2321   case T_BYTE   : __ sign_extend_byte (rax); break;
2322   case T_SHORT  : __ sign_extend_short(rax); break;
2323   case T_INT    : /* nothing to do */        break;
2324   case T_DOUBLE :
2325   case T_FLOAT  :
2326     // Result is in xmm0 we'll save as needed
2327     break;
2328   case T_ARRAY:                 // Really a handle
2329   case T_OBJECT:                // Really a handle
2330       break; // can't de-handlize until after safepoint check
2331   case T_VOID: break;
2332   case T_LONG: break;
2333   default       : ShouldNotReachHere();
2334   }
2335 
2336   Label after_transition;
2337 
2338   // Switch thread to "native transition" state before reading the synchronization state.
2339   // This additional state is necessary because reading and testing the synchronization
2340   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2341   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2342   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2343   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
2345   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2346 
  // Force this write out before the safepoint-state read below
2348   if (!UseSystemMemoryBarrier) {
2349     __ membar(Assembler::Membar_mask_bits(
2350               Assembler::LoadLoad | Assembler::LoadStore |
2351               Assembler::StoreLoad | Assembler::StoreStore));
2352   }
2353 
2354   // check for safepoint operation in progress and/or pending suspend requests
2355   {
2356     Label Continue;
2357     Label slow_path;
2358 
2359     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2360 
2361     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2362     __ jcc(Assembler::equal, Continue);
2363     __ bind(slow_path);
2364 
    // Don't use call_VM, as it will see a possible pending exception and forward it
    // and never return here, preventing us from clearing _last_native_pc down below.
    // Nor can we use call_VM_leaf, as it will check to see if rsi & rdi are
    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
    // by hand.
2370     //
2371     __ vzeroupper();
2372     save_native_result(masm, ret_type, stack_slots);
2373     __ mov(c_rarg0, r15_thread);
2374     __ mov(r12, rsp); // remember sp
2375     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2376     __ andptr(rsp, -16); // align stack as required by ABI
2377     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2378     __ mov(rsp, r12); // restore sp
2379     __ reinit_heapbase();
2380     // Restore any method result value
2381     restore_native_result(masm, ret_type, stack_slots);
2382     __ bind(Continue);
2383   }
2384 
2385   // change thread state
2386   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2387   __ bind(after_transition);
2388 
2389   // Check preemption for Object.wait()
2390   if (method->is_object_wait0()) {
2391     Label not_preempted;
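    // If the wait was preempted (virtual threads), the VM stashes an alternate return pc
    // in the thread; in that case clear it and jump there instead of falling through.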
2392     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2393     __ cmpptr(rscratch1, NULL_WORD);
2394     __ jccb(Assembler::equal, not_preempted);
2395     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2396     __ jmp(rscratch1);
2397     __ bind(not_preempted);
2398   }
2399   int resume_wait_offset = ((intptr_t)__ pc()) - start;
2400 
2401   Label reguard;
2402   Label reguard_done;
2403   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2404   __ jcc(Assembler::equal, reguard);
2405   __ bind(reguard_done);
2406 
2407   // native result if any is live
2408 
2409   // Unlock
2410   Label slow_path_unlock;
2411   Label unlock_done;
2412   if (method->is_synchronized()) {
2413 
2414     Label fast_done;
2415 
2416     // Get locked oop from the handle we passed to jni
2417     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2418 
2419     if (LockingMode == LM_LEGACY) {
2420       Label not_recur;
2421       // Simple recursive lock?
2422       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2423       __ jcc(Assembler::notEqual, not_recur);
2424       __ jmpb(fast_done);
2425       __ bind(not_recur);
2426     }
2427 
2428     // Must save rax if it is live now because cmpxchg must use it
2429     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2430       save_native_result(masm, ret_type, stack_slots);
2431     }
2432 
2433     if (LockingMode == LM_MONITOR) {
2434       __ jmp(slow_path_unlock);
2435     } else if (LockingMode == LM_LEGACY) {
2436       // get address of the stack lock
2437       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2438       //  get old displaced header
2439       __ movptr(old_hdr, Address(rax, 0));
2440 
2441       // Atomic swap old header if oop still contains the stack lock
2442       __ lock();
2443       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2444       __ jcc(Assembler::notEqual, slow_path_unlock);
2445       __ dec_held_monitor_count();
2446     } else {
2447       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2448       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2449     }
2450 
2451     // slow path re-enters here
2452     __ bind(unlock_done);
2453     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2454       restore_native_result(masm, ret_type, stack_slots);
2455     }
2456 
2457     __ bind(fast_done);
2458   }
2459   if (DTraceMethodProbes) {
2460     save_native_result(masm, ret_type, stack_slots);
2461     __ mov_metadata(c_rarg1, method());
2462     __ call_VM_leaf(
2463          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2464          r15_thread, c_rarg1);
2465     restore_native_result(masm, ret_type, stack_slots);
2466   }
2467 
2468   __ reset_last_Java_frame(false);
2469 
2470   // Unbox oop result, e.g. JNIHandles::resolve value.
2471   if (is_reference_type(ret_type)) {
2472     __ resolve_jobject(rax /* value */,
2473                        r15_thread /* thread */,
2474                        rcx /* tmp */);
2475   }
2476 
2477   if (CheckJNICalls) {
2478     // clear_pending_jni_exception_check
2479     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2480   }
2481 
2482   // reset handle block
2483   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2484   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
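  // Zeroing the top offset discards all JNI local handles created during the native call.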
2485 
2486   // pop our frame
2487 
2488   __ leave();
2489 
2490   // Any exception pending?
2491   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2492   __ jcc(Assembler::notEqual, exception_pending);
2493 
2494   // Return
2495 
2496   __ ret(0);
2497 
2498   // Unexpected paths are out of line and go here
2499 
  // An exception is pending; forward it
  __ bind(exception_pending);

2504   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2505 
2506   // Slow path locking & unlocking
2507   if (method->is_synchronized()) {
2508 
2509     // BEGIN Slow path lock
2510     __ bind(slow_path_lock);
2511 
    // We have last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM
2513     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2514 
2515     // protect the args we've loaded
2516     save_args(masm, total_c_args, c_arg, out_regs);
2517 
2518     __ mov(c_rarg0, obj_reg);
2519     __ mov(c_rarg1, lock_reg);
2520     __ mov(c_rarg2, r15_thread);
2521 
2522     // Not a leaf but we have last_Java_frame setup as we want
2523     // Force freeze slow path on ObjectMonitor::enter() for now which will fail with freeze_pinned_native.
2524     __ push_cont_fastpath();
2525     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2526     __ pop_cont_fastpath();
2527     restore_args(masm, total_c_args, c_arg, out_regs);
2528 
2529 #ifdef ASSERT
2530     { Label L;
2531     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2532     __ jcc(Assembler::equal, L);
2533     __ stop("no pending exception allowed on exit from monitorenter");
2534     __ bind(L);
2535     }
2536 #endif
2537     __ jmp(lock_done);
2538 
2539     // END Slow path lock
2540 
2541     // BEGIN Slow path unlock
2542     __ bind(slow_path_unlock);
2543 
2544     // If we haven't already saved the native result we must save it now as xmm registers
2545     // are still exposed.
2546     __ vzeroupper();
2547     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2548       save_native_result(masm, ret_type, stack_slots);
2549     }
2550 
2551     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2552 
2553     __ mov(c_rarg0, obj_reg);
2554     __ mov(c_rarg2, r15_thread);
2555     __ mov(r12, rsp); // remember sp
2556     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2557     __ andptr(rsp, -16); // align stack as required by ABI
2558 
2559     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2560     // NOTE that obj_reg == rbx currently
2561     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2562     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2563 
2564     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2565     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2566     __ mov(rsp, r12); // restore sp
2567     __ reinit_heapbase();
2568 #ifdef ASSERT
2569     {
2570       Label L;
2571       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2572       __ jcc(Assembler::equal, L);
2573       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2574       __ bind(L);
2575     }
2576 #endif /* ASSERT */
2577 
2578     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2579 
2580     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2581       restore_native_result(masm, ret_type, stack_slots);
2582     }
2583     __ jmp(unlock_done);
2584 
2585     // END Slow path unlock
2586 
2587   } // synchronized
2588 
2589   // SLOW PATH Reguard the stack if needed
2590 
2591   __ bind(reguard);
2592   __ vzeroupper();
2593   save_native_result(masm, ret_type, stack_slots);
2594   __ mov(r12, rsp); // remember sp
2595   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2596   __ andptr(rsp, -16); // align stack as required by ABI
2597   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2598   __ mov(rsp, r12); // restore sp
2599   __ reinit_heapbase();
2600   restore_native_result(masm, ret_type, stack_slots);
2601   // and continue
2602   __ jmp(reguard_done);
2603 
2604 
2605 
2606   __ flush();
2607 
2608   nmethod *nm = nmethod::new_native_nmethod(method,
2609                                             compile_id,
2610                                             masm->code(),
2611                                             vep_offset,
2612                                             frame_complete,
2613                                             stack_slots / VMRegImpl::slots_per_word,
2614                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2615                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2616                                             oop_maps);
2617 
2618   if (nm != nullptr && method->is_object_wait0()) {
2619     SharedRuntime::set_native_frame_resume_entry(nm->code_begin() + resume_wait_offset);
2620   }
2621 
2622   return nm;
2623 }
2624 
// This function returns the adjustment size (in number of words) to a c2i adapter
// activation for use during deoptimization.
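// For example, a callee with 3 parameters and 5 locals needs
// (5 - 3) * Interpreter::stackElementWords extra words.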
2627 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2628   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2629 }
2630 
2631 
2632 uint SharedRuntime::out_preserve_stack_slots() {
2633   return 0;
2634 }
2635 
2636 
2637 // Number of stack slots between incoming argument block and the start of
2638 // a new frame.  The PROLOG must add this many slots to the stack.  The
// EPILOG must remove this many slots.  amd64 needs two slots for the
// return address and two for the saved rbp.
2641 uint SharedRuntime::in_preserve_stack_slots() {
2642   return 4 + 2 * VerifyStackAtCalls;
2643 }
2644 
2645 VMReg SharedRuntime::thread_register() {
2646   return r15_thread->as_VMReg();
2647 }
2648 
2649 //------------------------------generate_deopt_blob----------------------------
2650 void SharedRuntime::generate_deopt_blob() {
2651   // Allocate space for the code
2652   ResourceMark rm;
2653   // Setup code generation tools
2654   int pad = 0;
2655   if (UseAVX > 2) {
2656     pad += 1024;
2657   }
2658   if (UseAPX) {
2659     pad += 1024;
2660   }
2661 #if INCLUDE_JVMCI
2662   if (EnableJVMCI) {
2663     pad += 512; // Increase the buffer size when compiling for JVMCI
2664   }
2665 #endif
2666   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2667   MacroAssembler* masm = new MacroAssembler(&buffer);
2668   int frame_size_in_words;
2669   OopMap* map = nullptr;
2670   OopMapSet *oop_maps = new OopMapSet();
2671 
2672   // -------------
2673   // This code enters when returning to a de-optimized nmethod.  A return
2674   // address has been pushed on the stack, and return values are in
2675   // registers.
  // If we are doing a normal deopt then we were called from the patched
  // nmethod from the point we returned to the nmethod. So the return
  // address on the stack is off by NativeCall::instruction_size.
  // We will adjust the value so it looks like we have the original return
  // address on the stack (like when we eagerly deoptimized).
2681   // In the case of an exception pending when deoptimizing, we enter
2682   // with a return address on the stack that points after the call we patched
2683   // into the exception handler. We have the following register state from,
2684   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2685   //    rax: exception oop
2686   //    rbx: exception handler
2687   //    rdx: throwing pc
2688   // So in this case we simply jam rdx into the useless return address and
2689   // the stack looks just like we want.
2690   //
2691   // At this point we need to de-opt.  We save the argument return
2692   // registers.  We call the first C routine, fetch_unroll_info().  This
2693   // routine captures the return values and returns a structure which
2694   // describes the current frame size and the sizes of all replacement frames.
2695   // The current frame is compiled code and may contain many inlined
2696   // functions, each with their own JVM state.  We pop the current frame, then
2697   // push all the new frames.  Then we call the C routine unpack_frames() to
2698   // populate these frames.  Finally unpack_frames() returns us the new target
2699   // address.  Notice that callee-save registers are BLOWN here; they have
2700   // already been captured in the vframeArray at the time the return PC was
2701   // patched.
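  //
  // Roughly, as an illustrative C-like sketch of what the assembly below does
  // (accessor names paraphrased from the UnrollBlock offsets used further down):
  //
  //   UnrollBlock* info = Deoptimization::fetch_unroll_info(thread, exec_mode);
  //   rsp += info->size_of_deoptimized_frame();          // pop the deoptimized frame
  //   for (int i = 0; i < info->number_of_frames(); i++) {
  //     // push a skeletal interpreter frame of frame_sizes[i] bytes
  //     // whose return pc is frame_pcs[i]
  //   }
  //   Deoptimization::unpack_frames(thread, exec_mode);  // fill in the skeletal frames
  //   // then return into the interpreter in the youngest new frame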
2702   address start = __ pc();
2703   Label cont;
2704 
  // Prolog for the non-exception case!
2706 
2707   // Save everything in sight.
2708   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2709 
2710   // Normal deoptimization.  Save exec mode for unpack_frames.
2711   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2712   __ jmp(cont);
2713 
2714   int reexecute_offset = __ pc() - start;
2715 #if INCLUDE_JVMCI && !defined(COMPILER1)
2716   if (EnableJVMCI && UseJVMCICompiler) {
2717     // JVMCI does not use this kind of deoptimization
2718     __ should_not_reach_here();
2719   }
2720 #endif
2721 
2722   // Reexecute case
  // The return address is the pc that describes which bci to re-execute at.
2724 
  // No need to update map as each call to save_live_registers will produce an identical oopmap
2726   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2727 
2728   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2729   __ jmp(cont);
2730 
2731 #if INCLUDE_JVMCI
2732   Label after_fetch_unroll_info_call;
2733   int implicit_exception_uncommon_trap_offset = 0;
2734   int uncommon_trap_offset = 0;
2735 
2736   if (EnableJVMCI) {
2737     implicit_exception_uncommon_trap_offset = __ pc() - start;
2738 
2739     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2740     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2741 
2742     uncommon_trap_offset = __ pc() - start;
2743 
2744     // Save everything in sight.
2745     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2746     // fetch_unroll_info needs to call last_java_frame()
2747     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2748 
2749     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2750     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2751 
2752     __ movl(r14, Deoptimization::Unpack_reexecute);
2753     __ mov(c_rarg0, r15_thread);
2754     __ movl(c_rarg2, r14); // exec mode
2755     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2756     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2757 
2758     __ reset_last_Java_frame(false);
2759 
2760     __ jmp(after_fetch_unroll_info_call);
2761   } // EnableJVMCI
2762 #endif // INCLUDE_JVMCI
2763 
2764   int exception_offset = __ pc() - start;
2765 
2766   // Prolog for exception case
2767 
  // All registers are dead at this entry point, except for rax and
  // rdx, which contain the exception oop and exception pc
  // respectively.  Set them in TLS and fall through to the
  // unpack_with_exception_in_tls entry point.
2772 
2773   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2774   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2775 
2776   int exception_in_tls_offset = __ pc() - start;
2777 
2778   // new implementation because exception oop is now passed in JavaThread
2779 
2780   // Prolog for exception case
2781   // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2783   // tos: stack at point of call to method that threw the exception (i.e. only
2784   // args are on the stack, no return address)
2785 
2786   // make room on stack for the return address
2787   // It will be patched later with the throwing pc. The correct value is not
2788   // available now because loading it from memory would destroy registers.
2789   __ push(0);
2790 
2791   // Save everything in sight.
2792   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2793 
2794   // Now it is safe to overwrite any register
2795 
2796   // Deopt during an exception.  Save exec mode for unpack_frames.
2797   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2798 
2799   // load throwing pc from JavaThread and patch it as the return address
2800   // of the current frame. Then clear the field in JavaThread
2801 
2802   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2803   __ movptr(Address(rbp, wordSize), rdx);
2804   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2805 
2806 #ifdef ASSERT
2807   // verify that there is really an exception oop in JavaThread
2808   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2809   __ verify_oop(rax);
2810 
2811   // verify that there is no pending exception
2812   Label no_pending_exception;
2813   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2814   __ testptr(rax, rax);
2815   __ jcc(Assembler::zero, no_pending_exception);
2816   __ stop("must not have pending exception here");
2817   __ bind(no_pending_exception);
2818 #endif
2819 
2820   __ bind(cont);
2821 
2822   // Call C code.  Need thread and this frame, but NOT official VM entry
2823   // crud.  We cannot block on this call, no GC can happen.
2824   //
  // UnrollBlock* fetch_unroll_info(JavaThread* thread, int exec_mode)
2826 
2827   // fetch_unroll_info needs to call last_java_frame().
2828 
2829   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2830 #ifdef ASSERT
2831   { Label L;
2832     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2833     __ jcc(Assembler::equal, L);
2834     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2835     __ bind(L);
2836   }
2837 #endif // ASSERT
2838   __ mov(c_rarg0, r15_thread);
2839   __ movl(c_rarg1, r14); // exec_mode
2840   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2841 
2842   // Need to have an oopmap that tells fetch_unroll_info where to
2843   // find any register it might need.
2844   oop_maps->add_gc_map(__ pc() - start, map);
2845 
2846   __ reset_last_Java_frame(false);
2847 
2848 #if INCLUDE_JVMCI
2849   if (EnableJVMCI) {
2850     __ bind(after_fetch_unroll_info_call);
2851   }
2852 #endif
2853 
2854   // Load UnrollBlock* into rdi
2855   __ mov(rdi, rax);
2856 
2857   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
  Label noException;
2859   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2860   __ jcc(Assembler::notEqual, noException);
2861   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // QQQ this is useless; it was cleared to null above
2863   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2864   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2865   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2866 
2867   __ verify_oop(rax);
2868 
2869   // Overwrite the result registers with the exception results.
2870   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2871   // I think this is useless
2872   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2873 
2874   __ bind(noException);
2875 
2876   // Only register save data is on the stack.
2877   // Now restore the result registers.  Everything else is either dead
2878   // or captured in the vframeArray.
2879   RegisterSaver::restore_result_registers(masm);
2880 
  // All of the register save area has been popped off the stack. Only the
  // return address remains.
2883 
2884   // Pop all the frames we must move/replace.
2885   //
2886   // Frame picture (youngest to oldest)
2887   // 1: self-frame (no frame link)
2888   // 2: deopting frame  (no frame link)
2889   // 3: caller of deopting frame (could be compiled/interpreted).
2890   //
2891   // Note: by leaving the return address of self-frame on the stack
2892   // and using the size of frame 2 to adjust the stack
2893   // when we are done the return to frame 3 will still be on the stack.
2894 
2895   // Pop deoptimized frame
2896   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2897   __ addptr(rsp, rcx);
2898 
2899   // rsp should be pointing at the return address to the caller (3)
2900 
2901   // Pick up the initial fp we should save
2902   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2903   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2904 
2905 #ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
2909   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2910   __ bang_stack_size(rbx, rcx);
2911 #endif
2912 
2913   // Load address of array of frame pcs into rcx
2914   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2915 
2916   // Trash the old pc
2917   __ addptr(rsp, wordSize);
2918 
2919   // Load address of array of frame sizes into rsi
2920   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2921 
2922   // Load counter into rdx
2923   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2924 
2925   // Now adjust the caller's stack to make up for the extra locals
2926   // but record the original sp so that we can save it in the skeletal interpreter
2927   // frame and the stack walking of interpreter_sender will get the unextended sp
2928   // value and not the "real" sp value.
2929 
2930   const Register sender_sp = r8;
2931 
2932   __ mov(sender_sp, rsp);
2933   __ movl(rbx, Address(rdi,
2934                        Deoptimization::UnrollBlock::
2935                        caller_adjustment_offset()));
2936   __ subptr(rsp, rbx);
2937 
2938   // Push interpreter frames in a loop
2939   Label loop;
2940   __ bind(loop);
2941   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2942   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2943   __ pushptr(Address(rcx, 0));          // Save return address
2944   __ enter();                           // Save old & set new ebp
2945   __ subptr(rsp, rbx);                  // Prolog
2946   // This value is corrected by layout_activation_impl
2947   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2948   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2949   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2950   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2951   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2952   __ decrementl(rdx);                   // Decrement counter
2953   __ jcc(Assembler::notZero, loop);
2954   __ pushptr(Address(rcx, 0));          // Save final return address
2955 
2956   // Re-push self-frame
2957   __ enter();                           // Save old & set new ebp
2958 
2959   // Allocate a full sized register save area.
2960   // Return address and rbp are in place, so we allocate two less words.
2961   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2962 
2963   // Restore frame locals after moving the frame
2964   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2965   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2966 
2967   // Call C code.  Need thread but NOT official VM entry
2968   // crud.  We cannot block on this call, no GC can happen.  Call should
2969   // restore return values to their stack-slots with the new SP.
2970   //
  // BasicType Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2972 
2973   // Use rbp because the frames look interpreted now
2974   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2975   // Don't need the precise return PC here, just precise enough to point into this code blob.
2976   address the_pc = __ pc();
2977   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2978 
2979   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2980   __ mov(c_rarg0, r15_thread);
2981   __ movl(c_rarg1, r14); // second arg: exec_mode
2982   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2983   // Revert SP alignment after call since we're going to do some SP relative addressing below
2984   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2985 
2986   // Set an oopmap for the call site
2987   // Use the same PC we used for the last java frame
2988   oop_maps->add_gc_map(the_pc - start,
2989                        new OopMap( frame_size_in_words, 0 ));
2990 
2991   // Clear fp AND pc
2992   __ reset_last_Java_frame(true);
2993 
2994   // Collect return values
2995   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2996   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2997   // I think this is useless (throwing pc?)
2998   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2999 
3000   // Pop self-frame.
3001   __ leave();                           // Epilog
3002 
3003   // Jump to interpreter
3004   __ ret(0);
3005 
3006   // Make sure all code is generated
3007   masm->flush();
3008 
3009   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3010   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3011 #if INCLUDE_JVMCI
3012   if (EnableJVMCI) {
3013     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3014     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3015   }
3016 #endif
3017 }
3018 
3019 #ifdef COMPILER2
3020 //------------------------------generate_uncommon_trap_blob--------------------
3021 void SharedRuntime::generate_uncommon_trap_blob() {
3022   // Allocate space for the code
3023   ResourceMark rm;
3024   // Setup code generation tools
3025   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
3026   MacroAssembler* masm = new MacroAssembler(&buffer);
3027 
3028   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3029 
3030   address start = __ pc();
3031 
3032   // Push self-frame.  We get here with a return address on the
3033   // stack, so rsp is 8-byte aligned until we allocate our frame.
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
3035 
3036   // No callee saved registers. rbp is assumed implicitly saved
3037   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3038 
  // The compiler left unloaded_class_index in j_rarg0; move it to where the
  // runtime expects it.
3041   __ movl(c_rarg1, j_rarg0);
3042 
3043   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3044 
3045   // Call C code.  Need thread but NOT official VM entry
3046   // crud.  We cannot block on this call, no GC can happen.  Call should
3047   // capture callee-saved registers as well as return values.
  // The thread is passed explicitly in c_rarg0 below.
  //
  // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
3051 
3052   __ mov(c_rarg0, r15_thread);
3053   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
3054   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3055 
3056   // Set an oopmap for the call site
3057   OopMapSet* oop_maps = new OopMapSet();
3058   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
3059 
3060   // location of rbp is known implicitly by the frame sender code
3061 
3062   oop_maps->add_gc_map(__ pc() - start, map);
3063 
3064   __ reset_last_Java_frame(false);
3065 
3066   // Load UnrollBlock* into rdi
3067   __ mov(rdi, rax);
3068 
3069 #ifdef ASSERT
3070   { Label L;
3071     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
3072               Deoptimization::Unpack_uncommon_trap);
3073     __ jcc(Assembler::equal, L);
3074     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
3075     __ bind(L);
3076   }
3077 #endif
3078 
3079   // Pop all the frames we must move/replace.
3080   //
3081   // Frame picture (youngest to oldest)
3082   // 1: self-frame (no frame link)
3083   // 2: deopting frame  (no frame link)
3084   // 3: caller of deopting frame (could be compiled/interpreted).
3085 
3086   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3087   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3088 
3089   // Pop deoptimized frame (int)
3090   __ movl(rcx, Address(rdi,
3091                        Deoptimization::UnrollBlock::
3092                        size_of_deoptimized_frame_offset()));
3093   __ addptr(rsp, rcx);
3094 
3095   // rsp should be pointing at the return address to the caller (3)
3096 
3097   // Pick up the initial fp we should save
3098   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3099   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3100 
3101 #ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3106   __ bang_stack_size(rbx, rcx);
3107 #endif
3108 
3109   // Load address of array of frame pcs into rcx (address*)
3110   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3111 
3112   // Trash the return pc
3113   __ addptr(rsp, wordSize);
3114 
3115   // Load address of array of frame sizes into rsi (intptr_t*)
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3117 
3118   // Counter
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
3120 
3121   // Now adjust the caller's stack to make up for the extra locals but
3122   // record the original sp so that we can save it in the skeletal
3123   // interpreter frame and the stack walking of interpreter_sender
3124   // will get the unextended sp value and not the "real" sp value.
3125 
3126   const Register sender_sp = r8;
3127 
3128   __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
3130   __ subptr(rsp, rbx);
3131 
3132   // Push interpreter frames in a loop
3133   Label loop;
3134   __ bind(loop);
3135   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3136   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3137   __ pushptr(Address(rcx, 0));     // Save return address
3138   __ enter();                      // Save old & set new rbp
3139   __ subptr(rsp, rbx);             // Prolog
3140   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3141             sender_sp);            // Make it walkable
3142   // This value is corrected by layout_activation_impl
3143   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3144   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3145   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3146   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3147   __ decrementl(rdx);              // Decrement counter
3148   __ jcc(Assembler::notZero, loop);
3149   __ pushptr(Address(rcx, 0));     // Save final return address
3150 
3151   // Re-push self-frame
3152   __ enter();                 // Save old & set new rbp
3153   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3154                               // Prolog
3155 
3156   // Use rbp because the frames look interpreted now
3157   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3158   // Don't need the precise return PC here, just precise enough to point into this code blob.
3159   address the_pc = __ pc();
3160   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3161 
3162   // Call C code.  Need thread but NOT official VM entry
3163   // crud.  We cannot block on this call, no GC can happen.  Call should
3164   // restore return values to their stack-slots with the new SP.
  // The thread is passed explicitly in c_rarg0 below.
3166   //
3167   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3168 
3169   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3170   __ mov(c_rarg0, r15_thread);
3171   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3172   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3173 
3174   // Set an oopmap for the call site
3175   // Use the same PC we used for the last java frame
3176   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3177 
3178   // Clear fp AND pc
3179   __ reset_last_Java_frame(true);
3180 
3181   // Pop self-frame.
3182   __ leave();                 // Epilog
3183 
3184   // Jump to interpreter
3185   __ ret(0);
3186 
3187   // Make sure all code is generated
3188   masm->flush();
3189 
3190   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3191                                                  SimpleRuntimeFrame::framesize >> 1);
3192 }
3193 #endif // COMPILER2
3194 
3195 //------------------------------generate_handler_blob------
3196 //
// Generate a special Compile2Runtime blob that saves all registers,
// and sets up an oopmap.
3199 //
3200 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3201   assert(StubRoutines::forward_exception_entry() != nullptr,
3202          "must be generated before");
3203 
3204   ResourceMark rm;
3205   OopMapSet *oop_maps = new OopMapSet();
3206   OopMap* map;
3207 
3208   // Allocate space for the code.  Setup code generation tools.
3209   CodeBuffer buffer("handler_blob", 2348, 1024);
3210   MacroAssembler* masm = new MacroAssembler(&buffer);
3211 
3212   address start   = __ pc();
3213   address call_pc = nullptr;
3214   int frame_size_in_words;
3215   bool cause_return = (poll_type == POLL_AT_RETURN);
3216   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3217 
3218   // Make room for return address (or push it again)
3219   if (!cause_return) {
3220     __ push(rbx);
3221   }
3222 
3223   // Save registers, fpu state, and flags
3224   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3225 
3226   // The following is basically a call_VM.  However, we need the precise
3227   // address of the call in order to generate an oopmap. Hence, we do all the
3228   // work ourselves.
3229 
  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return
  // address, which we store next:
  __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3231 
3232   // The return address must always be correct so that frame constructor never
3233   // sees an invalid pc.
3234 
3235   if (!cause_return) {
3236     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3237     // Additionally, rbx is a callee saved register and we can look at it later to determine
3238     // if someone changed the return address for us!
3239     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3240     __ movptr(Address(rbp, wordSize), rbx);
3241   }
3242 
3243   // Do the call
3244   __ mov(c_rarg0, r15_thread);
3245   __ call(RuntimeAddress(call_ptr));
3246 
3247   // Set an oopmap for the call site.  This oopmap will map all
3248   // oop-registers and debug-info registers as callee-saved.  This
3249   // will allow deoptimization at this safepoint to find all possible
3250   // debug-info recordings, as well as let GC find all oops.
3251 
3252   oop_maps->add_gc_map( __ pc() - start, map);
3253 
3254   Label noException;
3255 
3256   __ reset_last_Java_frame(false);
3257 
3258   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3259   __ jcc(Assembler::equal, noException);
3260 
3261   // Exception pending
3262 
3263   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3264 
3265   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3266 
3267   // No exception case
3268   __ bind(noException);
3269 
3270   Label no_adjust;
3271 #ifdef ASSERT
3272   Label bail;
3273 #endif
3274   if (!cause_return) {
3275     Label no_prefix, not_special;
3276 
3277     // If our stashed return pc was modified by the runtime we avoid touching it
3278     __ cmpptr(rbx, Address(rbp, wordSize));
3279     __ jccb(Assembler::notEqual, no_adjust);
3280 
3281     // Skip over the poll instruction.
3282     // See NativeInstruction::is_safepoint_poll()
3283     // Possible encodings:
3284     //      85 00       test   %eax,(%rax)
3285     //      85 01       test   %eax,(%rcx)
3286     //      85 02       test   %eax,(%rdx)
3287     //      85 03       test   %eax,(%rbx)
3288     //      85 06       test   %eax,(%rsi)
3289     //      85 07       test   %eax,(%rdi)
3290     //
3291     //   41 85 00       test   %eax,(%r8)
3292     //   41 85 01       test   %eax,(%r9)
3293     //   41 85 02       test   %eax,(%r10)
3294     //   41 85 03       test   %eax,(%r11)
3295     //   41 85 06       test   %eax,(%r14)
3296     //   41 85 07       test   %eax,(%r15)
3297     //
3298     //      85 04 24    test   %eax,(%rsp)
3299     //   41 85 04 24    test   %eax,(%r12)
3300     //      85 45 00    test   %eax,0x0(%rbp)
3301     //   41 85 45 00    test   %eax,0x0(%r13)
3302 
3303     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3304     __ jcc(Assembler::notEqual, no_prefix);
3305     __ addptr(rbx, 1);
3306     __ bind(no_prefix);
3307 #ifdef ASSERT
3308     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3309 #endif
3310     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3311     // r12/rsp 0x04
3312     // r13/rbp 0x05
3313     __ movzbq(rcx, Address(rbx, 1));
3314     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3315     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3316     __ cmpptr(rcx, 1);
3317     __ jcc(Assembler::above, not_special);
3318     __ addptr(rbx, 1);
3319     __ bind(not_special);
3320 #ifdef ASSERT
3321     // Verify the correct encoding of the poll we're about to skip.
3322     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3323     __ jcc(Assembler::notEqual, bail);
3324     // Mask out the modrm bits
3325     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3326     // rax encodes to 0, so if the bits are nonzero it's incorrect
3327     __ jcc(Assembler::notZero, bail);
3328 #endif
3329     // Adjust return pc forward to step over the safepoint poll instruction
3330     __ addptr(rbx, 2);
3331     __ movptr(Address(rbp, wordSize), rbx);
3332   }
3333 
3334   __ bind(no_adjust);
3335   // Normal exit, restore registers and exit.
3336   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3337   __ ret(0);
3338 
3339 #ifdef ASSERT
3340   __ bind(bail);
3341   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3342 #endif
3343 
3344   // Make sure all code is generated
3345   masm->flush();
3346 
3347   // Fill-out other meta info
3348   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3349 }
3350 
3351 //
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3353 //
// Generate a stub that calls into the VM to find out the proper destination
// of a Java call. All the argument registers are live at this point,
// but since this is generic code we don't know what they are and the caller
// must do any GC of the args.
3358 //
3359 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3360   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3361 
3362   // allocate space for the code
3363   ResourceMark rm;
3364 
3365   CodeBuffer buffer(name, 1552, 512);
3366   MacroAssembler* masm = new MacroAssembler(&buffer);
3367 
3368   int frame_size_in_words;
3369 
3370   OopMapSet *oop_maps = new OopMapSet();
3371   OopMap* map = nullptr;
3372 
3373   int start = __ offset();
3374 
3375   // No need to save vector registers since they are caller-saved anyway.
3376   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3377 
3378   int frame_complete = __ offset();
3379 
3380   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3381 
3382   __ mov(c_rarg0, r15_thread);
3383 
3384   __ call(RuntimeAddress(destination));
3385 
3386 
3387   // Set an oopmap for the call site.
3388   // We need this not only for callee-saved registers, but also for volatile
3389   // registers that the compiler might be keeping live across a safepoint.
3390 
3391   oop_maps->add_gc_map( __ offset() - start, map);
3392 
3393   // rax contains the address we are going to jump to assuming no exception got installed
3394 
3395   // clear last_Java_sp
3396   __ reset_last_Java_frame(false);
3397   // check for pending exceptions
3398   Label pending;
3399   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3400   __ jcc(Assembler::notEqual, pending);
3401 
3402   // get the returned Method*
3403   __ get_vm_result_2(rbx, r15_thread);
3404   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3405 
3406   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3407 
3408   RegisterSaver::restore_live_registers(masm);
3409 
3410   // We are back to the original state on entry and ready to go.
3411 
3412   __ jmp(rax);
3413 
3414   // Pending exception after the safepoint
3415 
3416   __ bind(pending);
3417 
3418   RegisterSaver::restore_live_registers(masm);
3419 
3420   // exception pending => remove activation and forward to exception handler
3421 
3422   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3423 
3424   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3425   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3426 
3427   // -------------
3428   // make sure all code is generated
3429   masm->flush();
3430 
  // Return the blob.
  // The frame size passed to the stub is in words.
3433   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3434 }
3435 
3436 //------------------------------Montgomery multiplication------------------------
3437 //
3438 
3439 #ifndef _WINDOWS
3440 
3441 // Subtract 0:b from carry:a.  Return carry.
3442 static julong
3443 sub(julong a[], julong b[], julong carry, long len) {
3444   long long i = 0, cnt = len;
3445   julong tmp;
3446   asm volatile("clc; "
3447                "0: ; "
3448                "mov (%[b], %[i], 8), %[tmp]; "
3449                "sbb %[tmp], (%[a], %[i], 8); "
3450                "inc %[i]; dec %[cnt]; "
3451                "jne 0b; "
3452                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3453                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3454                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3455                : "memory");
3456   return tmp;
3457 }
3458 
3459 // Multiply (unsigned) Long A by Long B, accumulating the double-
3460 // length result into the accumulator formed of T0, T1, and T2.
3461 #define MACC(A, B, T0, T1, T2)                                  \
3462 do {                                                            \
3463   unsigned long hi, lo;                                         \
3464   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3465            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3466            : "r"(A), "a"(B) : "cc");                            \
3467  } while(0)
3468 
3469 // As above, but add twice the double-length result into the
3470 // accumulator.
3471 #define MACC2(A, B, T0, T1, T2)                                 \
3472 do {                                                            \
3473   unsigned long hi, lo;                                         \
3474   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3475            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3476            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3477            : "r"(A), "a"(B) : "cc");                            \
3478  } while(0)
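
// Put differently (illustrative; the same holds for the _WINDOWS variants below):
// viewing T2:T1:T0 as a single 192-bit accumulator,
//   MACC(A, B, T0, T1, T2)  computes  acc += (full 128-bit product) A * B
//   MACC2(A, B, T0, T1, T2) computes  acc += 2 * A * B
// with any carry out of T2 discarded.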
3479 
3480 #else //_WINDOWS
3481 
3482 static julong
3483 sub(julong a[], julong b[], julong carry, long len) {
3484   long i;
3485   julong tmp;
3486   unsigned char c = 1;
3487   for (i = 0; i < len; i++) {
3488     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3489     a[i] = tmp;
3490   }
3491   c = _addcarry_u64(c, carry, ~0, &tmp);
3492   return tmp;
3493 }
3494 
3495 // Multiply (unsigned) Long A by Long B, accumulating the double-
3496 // length result into the accumulator formed of T0, T1, and T2.
3497 #define MACC(A, B, T0, T1, T2)                          \
3498 do {                                                    \
3499   julong hi, lo;                            \
3500   lo = _umul128(A, B, &hi);                             \
3501   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3502   c = _addcarry_u64(c, hi, T1, &T1);                    \
3503   _addcarry_u64(c, T2, 0, &T2);                         \
3504  } while(0)
3505 
3506 // As above, but add twice the double-length result into the
3507 // accumulator.
3508 #define MACC2(A, B, T0, T1, T2)                         \
3509 do {                                                    \
3510   julong hi, lo;                            \
3511   lo = _umul128(A, B, &hi);                             \
3512   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3513   c = _addcarry_u64(c, hi, T1, &T1);                    \
3514   _addcarry_u64(c, T2, 0, &T2);                         \
3515   c = _addcarry_u64(0, lo, T0, &T0);                    \
3516   c = _addcarry_u64(c, hi, T1, &T1);                    \
3517   _addcarry_u64(c, T2, 0, &T2);                         \
3518  } while(0)
3519 
3520 #endif //_WINDOWS
3521 
3522 // Fast Montgomery multiplication.  The derivation of the algorithm is
3523 // in  A Cryptographic Library for the Motorola DSP56000,
3524 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
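//
// Stated as a formula (a summary of the reference above, not extra checking done
// here): with R = 2^(64*len) and inv * n[0] == -1 (mod 2^64) -- which is what the
// assert below verifies via ULLONG_MAX -- the routine leaves in m a len-word value
// congruent to a * b * R^-1 (mod n), i.e. the Montgomery product of a and b.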
3525 
3526 static void NOINLINE
3527 montgomery_multiply(julong a[], julong b[], julong n[],
3528                     julong m[], julong inv, int len) {
3529   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3530   int i;
3531 
3532   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3533 
3534   for (i = 0; i < len; i++) {
3535     int j;
3536     for (j = 0; j < i; j++) {
3537       MACC(a[j], b[i-j], t0, t1, t2);
3538       MACC(m[j], n[i-j], t0, t1, t2);
3539     }
3540     MACC(a[i], b[0], t0, t1, t2);
3541     m[i] = t0 * inv;
3542     MACC(m[i], n[0], t0, t1, t2);
3543 
3544     assert(t0 == 0, "broken Montgomery multiply");
3545 
3546     t0 = t1; t1 = t2; t2 = 0;
3547   }
3548 
3549   for (i = len; i < 2*len; i++) {
3550     int j;
3551     for (j = i-len+1; j < len; j++) {
3552       MACC(a[j], b[i-j], t0, t1, t2);
3553       MACC(m[j], n[i-j], t0, t1, t2);
3554     }
3555     m[i-len] = t0;
3556     t0 = t1; t1 = t2; t2 = 0;
3557   }
3558 
3559   while (t0)
3560     t0 = sub(m, n, t0, len);
3561 }
3562 
3563 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3564 // multiplies so it should be up to 25% faster than Montgomery
3565 // multiplication.  However, its loop control is more complex and it
3566 // may actually run slower on some machines.
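//
// Where the 25% comes from (an informal count, not a guarantee): the multiply above
// performs about len^2 MACCs for the a*b terms plus len^2 for the m*n terms, roughly
// 2*len^2 hardware multiplies.  Squaring folds the symmetric a[j]*a[i-j] pairs into
// about len^2/2 MACC2s (one multiply each) but still needs ~len^2 multiplies for the
// m*n terms, so roughly 1.5*len^2 in total -- 25% fewer.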
3567 
3568 static void NOINLINE
3569 montgomery_square(julong a[], julong n[],
3570                   julong m[], julong inv, int len) {
3571   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3572   int i;
3573 
3574   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3575 
3576   for (i = 0; i < len; i++) {
3577     int j;
3578     int end = (i+1)/2;
3579     for (j = 0; j < end; j++) {
3580       MACC2(a[j], a[i-j], t0, t1, t2);
3581       MACC(m[j], n[i-j], t0, t1, t2);
3582     }
3583     if ((i & 1) == 0) {
3584       MACC(a[j], a[j], t0, t1, t2);
3585     }
3586     for (; j < i; j++) {
3587       MACC(m[j], n[i-j], t0, t1, t2);
3588     }
3589     m[i] = t0 * inv;
3590     MACC(m[i], n[0], t0, t1, t2);
3591 
3592     assert(t0 == 0, "broken Montgomery square");
3593 
3594     t0 = t1; t1 = t2; t2 = 0;
3595   }
3596 
3597   for (i = len; i < 2*len; i++) {
3598     int start = i-len+1;
3599     int end = start + (len - start)/2;
3600     int j;
3601     for (j = start; j < end; j++) {
3602       MACC2(a[j], a[i-j], t0, t1, t2);
3603       MACC(m[j], n[i-j], t0, t1, t2);
3604     }
3605     if ((i & 1) == 0) {
3606       MACC(a[j], a[j], t0, t1, t2);
3607     }
3608     for (; j < len; j++) {
3609       MACC(m[j], n[i-j], t0, t1, t2);
3610     }
3611     m[i-len] = t0;
3612     t0 = t1; t1 = t2; t2 = 0;
3613   }
3614 
3615   while (t0)
3616     t0 = sub(m, n, t0, len);
3617 }
3618 
3619 // Swap words in a longword.
3620 static julong swap(julong x) {
3621   return (x << 32) | (x >> 32);
3622 }
3623 
3624 // Copy len longwords from s to d, word-swapping as we go.  The
3625 // destination array is reversed.
3626 static void reverse_words(julong *s, julong *d, int len) {
3627   d += len;
3628   while(len-- > 0) {
3629     d--;
3630     *d = swap(*s);
3631     s++;
3632   }
3633 }
3634 
3635 // The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3637 #define MONTGOMERY_SQUARING_THRESHOLD 64
3638 
3639 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3640                                         jint len, jlong inv,
3641                                         jint *m_ints) {
3642   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3643   int longwords = len/2;
3644 
3645   // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use a total of 8K bytes of stack space here.
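  // Checking the arithmetic (illustrative): divisor = 8 * 4 = 32, so the guarantee
  // caps longwords at 8192 / 32 = 256, i.e. len <= 512 jints = 512 * 32 = 16384 bits,
  // and at that cap the alloca below grabs 256 * 8 * 4 = 8192 bytes.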
3648   int divisor = sizeof(julong) * 4;
3649   guarantee(longwords <= 8192 / divisor, "must be");
3650   int total_allocation = longwords * sizeof (julong) * 4;
3651   julong *scratch = (julong *)alloca(total_allocation);
3652 
3653   // Local scratch arrays
3654   julong
3655     *a = scratch + 0 * longwords,
3656     *b = scratch + 1 * longwords,
3657     *n = scratch + 2 * longwords,
3658     *m = scratch + 3 * longwords;
3659 
3660   reverse_words((julong *)a_ints, a, longwords);
3661   reverse_words((julong *)b_ints, b, longwords);
3662   reverse_words((julong *)n_ints, n, longwords);
3663 
3664   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3665 
3666   reverse_words(m, (julong *)m_ints, longwords);
3667 }
3668 
3669 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3670                                       jint len, jlong inv,
3671                                       jint *m_ints) {
3672   assert(len % 2 == 0, "array length in montgomery_square must be even");
3673   int longwords = len/2;
3674 
3675   // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use a total of 6K bytes of stack space here.
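  // Checking the arithmetic (illustrative): divisor = 8 * 3 = 24, so the guarantee
  // caps longwords at 8192 / 24 = 341; the 512-jint case quoted above is 256
  // longwords and allocates 256 * 8 * 3 = 6144 bytes (the 6K), while the cap itself
  // stays just under 8192 bytes (341 * 24 = 8184).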
3678   int divisor = sizeof(julong) * 3;
3679   guarantee(longwords <= (8192 / divisor), "must be");
3680   int total_allocation = longwords * sizeof (julong) * 3;
3681   julong *scratch = (julong *)alloca(total_allocation);
3682 
3683   // Local scratch arrays
3684   julong
3685     *a = scratch + 0 * longwords,
3686     *n = scratch + 1 * longwords,
3687     *m = scratch + 2 * longwords;
3688 
3689   reverse_words((julong *)a_ints, a, longwords);
3690   reverse_words((julong *)n_ints, n, longwords);
3691 
3692   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3693     ::montgomery_square(a, n, m, (julong)inv, longwords);
3694   } else {
3695     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3696   }
3697 
3698   reverse_words(m, (julong *)m_ints, longwords);
3699 }
3700 
3701 #ifdef COMPILER2
3702 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3703 //
3704 //------------------------------generate_exception_blob---------------------------
// Creates the exception blob at the end.
// This code is jumped to from a compiled method
// (see emit_exception_handler in the x86_64.ad file).
3708 //
3709 // Given an exception pc at a call we call into the runtime for the
3710 // handler in this method. This handler might merely restore state
// (i.e. callee-saved registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java-level handler
3713 // for the nmethod.
3714 //
3715 // This code is entered with a jmp.
3716 //
3717 // Arguments:
3718 //   rax: exception oop
3719 //   rdx: exception pc
3720 //
3721 // Results:
3722 //   rax: exception oop
3723 //   rdx: exception pc in caller or ???
3724 //   destination: exception handler of caller
3725 //
3726 // Note: the exception pc MUST be at a call (precise debug information)
3727 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3728 //
3729 
3730 void OptoRuntime::generate_exception_blob() {
3731   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3732   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3733   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3734 
3735   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3736 
3737   // Allocate space for the code
3738   ResourceMark rm;
3739   // Setup code generation tools
3740   CodeBuffer buffer("exception_blob", 2048, 1024);
3741   MacroAssembler* masm = new MacroAssembler(&buffer);
3742 
3743 
3744   address start = __ pc();
3745 
3746   // Exception pc is 'return address' for stack walker
3747   __ push(rdx);
3748   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3749 
3750   // Save callee-saved registers.  See x86_64.ad.
3751 
  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-saved registers now that adapter frames are gone.
3755 
3756   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3757 
3758   // Store exception in Thread object. We cannot pass any arguments to the
3759   // handle_exception call, since we do not want to make any assumption
3760   // about the size of the frame where the exception happened in.
3761   // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3763   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3764 
3765   // This call does all the hard work.  It checks if an exception handler
3766   // exists in the method.
3767   // If so, it returns the handler address.
3768   // If not, it prepares for stack-unwinding, restoring the callee-save
3769   // registers of the frame being removed.
3770   //
3771   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3772 
3773   // At a method handle call, the stack may not be properly aligned
3774   // when returning with an exception.
3775   address the_pc = __ pc();
3776   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3777   __ mov(c_rarg0, r15_thread);
3778   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3779   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3780 
3781   // Set an oopmap for the call site.  This oopmap will only be used if we
3782   // are unwinding the stack.  Hence, all locations will be dead.
3783   // Callee-saved registers will be the same as the frame above (i.e.,
3784   // handle_exception_stub), since they were restored when we got the
3785   // exception.
3786 
3787   OopMapSet* oop_maps = new OopMapSet();
3788 
3789   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3790 
3791   __ reset_last_Java_frame(false);
3792 
3793   // Restore callee-saved registers
3794 
  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-saved registers now that adapter frames are gone.
3798 
3799   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3800 
3801   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3802   __ pop(rdx);                  // No need for exception pc anymore
3803 
3804   // rax: exception handler
3805 
3806   // We have a handler in rax (could be deopt blob).
3807   __ mov(r8, rax);
3808 
3809   // Get the exception oop
3810   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3811   // Get the exception pc in case we are deoptimized
3812   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3813 #ifdef ASSERT
3814   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3815   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3816 #endif
3817   // Clear the exception oop so GC no longer processes it as a root.
3818   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3819 
3820   // rax: exception oop
3821   // r8:  exception handler
3822   // rdx: exception pc
3823   // Jump to handler
3824 
3825   __ jmp(r8);
3826 
3827   // Make sure all code is generated
3828   masm->flush();
3829 
3830   // Set exception blob
3831   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3832 }
3833 #endif // COMPILER2