1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/vframeArray.hpp"
  56 #include "runtime/vm_version.hpp"
  57 #include "utilities/align.hpp"
  58 #include "utilities/checkedCast.hpp"
  59 #include "utilities/formatBuffer.hpp"
  60 #include "vmreg_x86.inline.hpp"
  61 #ifdef COMPILER1
  62 #include "c1/c1_Runtime1.hpp"
  63 #endif
  64 #ifdef COMPILER2
  65 #include "opto/runtime.hpp"
  66 #endif
  67 #if INCLUDE_JVMCI
  68 #include "jvmci/jvmciJavaClasses.hpp"
  69 #endif
  70 
  71 #define __ masm->
  72 
  73 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  74 
  75 class SimpleRuntimeFrame {
  76 
  77   public:
  78 
  79   // Most of the runtime stubs have this simple frame layout.
  80   // This class exists to make the layout shared in one place.
  81   // Offsets are for compiler stack slots, which are jints.
  82   enum layout {
  83     // The frame sender code expects that rbp will be in the "natural" place and
  84     // will override any oopMap setting for it. We must therefore force the layout
  85     // so that it agrees with the frame sender code.
  86     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  87     rbp_off2,
  88     return_off, return_off2,
  89     framesize
  90   };
  91 };
  92 
  93 class RegisterSaver {
  94   // Capture info about frame layout.  Layout offsets are in jint
  95   // units because compiler frame slots are jints.
  96 #define XSAVE_AREA_BEGIN 160
  97 #define XSAVE_AREA_YMM_BEGIN 576
  98 #define XSAVE_AREA_EGPRS 960
  99 #define XSAVE_AREA_OPMASK_BEGIN 1088
 100 #define XSAVE_AREA_ZMM_BEGIN 1152
 101 #define XSAVE_AREA_UPPERBANK 1664
 102 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 103 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 104 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 105 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 106 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 107   enum layout {
 108     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
 109     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
 110     DEF_XMM_OFFS(0),
 111     DEF_XMM_OFFS(1),
 112     // 2..15 are implied in range usage
 113     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 114     DEF_YMM_OFFS(0),
 115     DEF_YMM_OFFS(1),
 116     // 2..15 are implied in range usage
 117     r31_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 118     r31H_off,
 119     r30_off, r30H_off,
 120     r29_off, r29H_off,
 121     r28_off, r28H_off,
 122     r27_off, r27H_off,
 123     r26_off, r26H_off,
 124     r25_off, r25H_off,
 125     r24_off, r24H_off,
 126     r23_off, r23H_off,
 127     r22_off, r22H_off,
 128     r21_off, r21H_off,
 129     r20_off, r20H_off,
 130     r19_off, r19H_off,
 131     r18_off, r18H_off,
 132     r17_off, r17H_off,
 133     r16_off, r16H_off,
 134     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 135     DEF_OPMASK_OFFS(0),
 136     DEF_OPMASK_OFFS(1),
 137     // 2..7 are implied in range usage
 138     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 139     DEF_ZMM_OFFS(0),
 140     DEF_ZMM_OFFS(1),
 141     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 142     DEF_ZMM_UPPER_OFFS(16),
 143     DEF_ZMM_UPPER_OFFS(17),
 144     // 18..31 are implied in range usage
 145     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 146     fpu_stateH_end,
 147     r15_off, r15H_off,
 148     r14_off, r14H_off,
 149     r13_off, r13H_off,
 150     r12_off, r12H_off,
 151     r11_off, r11H_off,
 152     r10_off, r10H_off,
 153     r9_off,  r9H_off,
 154     r8_off,  r8H_off,
 155     rdi_off, rdiH_off,
 156     rsi_off, rsiH_off,
 157     ignore_off, ignoreH_off,  // extra copy of rbp
 158     rsp_off, rspH_off,
 159     rbx_off, rbxH_off,
 160     rdx_off, rdxH_off,
 161     rcx_off, rcxH_off,
 162     rax_off, raxH_off,
 163     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 164     align_off, alignH_off,
 165     flags_off, flagsH_off,
 166     // The frame sender code expects that rbp will be in the "natural" place and
 167     // will override any oopMap setting for it. We must therefore force the layout
 168     // so that it agrees with the frame sender code.
 169     rbp_off, rbpH_off,        // copy of rbp we will restore
 170     return_off, returnH_off,  // slot for return address
 171     reg_save_size             // size in compiler stack slots
 172   };
 173 
 174  public:
 175   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 176   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 177 
 178   // Offsets into the register save area
 179   // Used by deoptimization when it is managing result register
 180   // values on its own
 181 
 182   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 183   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 184   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 185   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 186   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 187 
 188   // During deoptimization only the result registers need to be restored,
 189   // all the other values have already been extracted.
 190   static void restore_result_registers(MacroAssembler* masm);
 191 };
 192 
 193 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 194   int off = 0;
 195   int num_xmm_regs = XMMRegister::available_xmm_registers();
 196 #if COMPILER2_OR_JVMCI
 197   if (save_wide_vectors && UseAVX == 0) {
 198     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 199   }
 200   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 201 #else
 202   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 203 #endif
 204 
 205   // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
 206   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 207   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 208   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 209   // CodeBlob frame size is in words.
 210   int frame_size_in_words = frame_size_in_bytes / wordSize;
 211   *total_frame_words = frame_size_in_words;
 212 
 213   // Save registers, fpu state, and flags.
 214   // We assume caller has already pushed the return address onto the
 215   // stack, so rsp is 8-byte aligned here.
 216   // We push rpb twice in this sequence because we want the real rbp
 217   // to be under the return like a normal enter.
 218 
 219   __ enter();          // rsp becomes 16-byte aligned here
 220   __ pushf();
 221   // Make sure rsp stays 16-byte aligned
 222   __ subq(rsp, 8);
 223   // Push CPU state in multiple of 16 bytes
 224   __ save_legacy_gprs();
 225   __ push_FPU_state();
 226 
 227 
 228   // push cpu state handles this on EVEX enabled targets
 229   if (save_wide_vectors) {
 230     // Save upper half of YMM registers(0..15)
 231     int base_addr = XSAVE_AREA_YMM_BEGIN;
 232     for (int n = 0; n < 16; n++) {
 233       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 234     }
 235     if (VM_Version::supports_evex()) {
 236       // Save upper half of ZMM registers(0..15)
 237       base_addr = XSAVE_AREA_ZMM_BEGIN;
 238       for (int n = 0; n < 16; n++) {
 239         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 240       }
 241       // Save full ZMM registers(16..num_xmm_regs)
 242       base_addr = XSAVE_AREA_UPPERBANK;
 243       off = 0;
 244       int vector_len = Assembler::AVX_512bit;
 245       for (int n = 16; n < num_xmm_regs; n++) {
 246         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 247       }
 248 #if COMPILER2_OR_JVMCI
 249       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 250       off = 0;
 251       for(int n = 0; n < KRegister::number_of_registers; n++) {
 252         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 253       }
 254 #endif
 255     }
 256   } else {
 257     if (VM_Version::supports_evex()) {
 258       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 259       int base_addr = XSAVE_AREA_UPPERBANK;
 260       off = 0;
 261       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 262       for (int n = 16; n < num_xmm_regs; n++) {
 263         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 264       }
 265 #if COMPILER2_OR_JVMCI
 266       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 267       off = 0;
 268       for(int n = 0; n < KRegister::number_of_registers; n++) {
 269         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 270       }
 271 #endif
 272     }
 273   }
 274 
 275 #if COMPILER2_OR_JVMCI
 276   if (UseAPX) {
 277       int base_addr = XSAVE_AREA_EGPRS;
 278       off = 0;
 279       for(int n = 16; n < Register::number_of_registers; n++) {
 280         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 281       }
 282   }
 283 #endif
 284 
 285   __ vzeroupper();
 286   if (frame::arg_reg_save_area_bytes != 0) {
 287     // Allocate argument register save area
 288     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 289   }
 290 
 291   // Set an oopmap for the call site.  This oopmap will map all
 292   // oop-registers and debug-info registers as callee-saved.  This
 293   // will allow deoptimization at this safepoint to find all possible
 294   // debug-info recordings, as well as let GC find all oops.
 295 
 296   OopMapSet *oop_maps = new OopMapSet();
 297   OopMap* map = new OopMap(frame_size_in_slots, 0);
 298 
 299 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 300 
 301   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 305   // rbp location is known implicitly by the frame sender code, needs no oopmap
 306   // and the location where rbp was saved by is ignored
 307   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 308   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 309   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 310   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 311   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 312   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 313   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 314   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 315   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 316   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 317 
 318   if (UseAPX) {
 319     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 324     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 325     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 326     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 327     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 328     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 329     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 330     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 331     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 332     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 333     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 334     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 335   }
 336   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 337   // on EVEX enabled targets, we get it included in the xsave area
 338   off = xmm0_off;
 339   int delta = xmm1_off - off;
 340   for (int n = 0; n < 16; n++) {
 341     XMMRegister xmm_name = as_XMMRegister(n);
 342     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 343     off += delta;
 344   }
 345   if (UseAVX > 2) {
 346     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 347     off = zmm16_off;
 348     delta = zmm17_off - off;
 349     for (int n = 16; n < num_xmm_regs; n++) {
 350       XMMRegister zmm_name = as_XMMRegister(n);
 351       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 352       off += delta;
 353     }
 354   }
 355 
 356 #if COMPILER2_OR_JVMCI
 357   if (save_wide_vectors) {
 358     // Save upper half of YMM registers(0..15)
 359     off = ymm0_off;
 360     delta = ymm1_off - ymm0_off;
 361     for (int n = 0; n < 16; n++) {
 362       XMMRegister ymm_name = as_XMMRegister(n);
 363       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 364       off += delta;
 365     }
 366     if (VM_Version::supports_evex()) {
 367       // Save upper half of ZMM registers(0..15)
 368       off = zmm0_off;
 369       delta = zmm1_off - zmm0_off;
 370       for (int n = 0; n < 16; n++) {
 371         XMMRegister zmm_name = as_XMMRegister(n);
 372         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 373         off += delta;
 374       }
 375     }
 376   }
 377 #endif // COMPILER2_OR_JVMCI
 378 
 379   // %%% These should all be a waste but we'll keep things as they were for now
 380   if (true) {
 381     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 385     // rbp location is known implicitly by the frame sender code, needs no oopmap
 386     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 387     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 388     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 389     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 390     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 391     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 392     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 393     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 394     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 395     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 396     if (UseAPX) {
 397       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 402       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 403       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 404       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 405       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 406       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 407       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 408       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 409       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 410       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 411       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 412       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 413     }
 414     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 415     // on EVEX enabled targets, we get it included in the xsave area
 416     off = xmm0H_off;
 417     delta = xmm1H_off - off;
 418     for (int n = 0; n < 16; n++) {
 419       XMMRegister xmm_name = as_XMMRegister(n);
 420       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 421       off += delta;
 422     }
 423     if (UseAVX > 2) {
 424       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 425       off = zmm16H_off;
 426       delta = zmm17H_off - off;
 427       for (int n = 16; n < num_xmm_regs; n++) {
 428         XMMRegister zmm_name = as_XMMRegister(n);
 429         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 430         off += delta;
 431       }
 432     }
 433   }
 434 
 435   return map;
 436 }
 437 
 438 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 439   int num_xmm_regs = XMMRegister::available_xmm_registers();
 440   if (frame::arg_reg_save_area_bytes != 0) {
 441     // Pop arg register save area
 442     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 443   }
 444 
 445 #if COMPILER2_OR_JVMCI
 446   if (restore_wide_vectors) {
 447     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 448     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 449   }
 450 #else
 451   assert(!restore_wide_vectors, "vectors are generated only by C2");
 452 #endif
 453 
 454   __ vzeroupper();
 455 
 456   // On EVEX enabled targets everything is handled in pop fpu state
 457   if (restore_wide_vectors) {
 458     // Restore upper half of YMM registers (0..15)
 459     int base_addr = XSAVE_AREA_YMM_BEGIN;
 460     for (int n = 0; n < 16; n++) {
 461       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 462     }
 463     if (VM_Version::supports_evex()) {
 464       // Restore upper half of ZMM registers (0..15)
 465       base_addr = XSAVE_AREA_ZMM_BEGIN;
 466       for (int n = 0; n < 16; n++) {
 467         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 468       }
 469       // Restore full ZMM registers(16..num_xmm_regs)
 470       base_addr = XSAVE_AREA_UPPERBANK;
 471       int vector_len = Assembler::AVX_512bit;
 472       int off = 0;
 473       for (int n = 16; n < num_xmm_regs; n++) {
 474         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 475       }
 476 #if COMPILER2_OR_JVMCI
 477       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 478       off = 0;
 479       for (int n = 0; n < KRegister::number_of_registers; n++) {
 480         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 481       }
 482 #endif
 483     }
 484   } else {
 485     if (VM_Version::supports_evex()) {
 486       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 487       int base_addr = XSAVE_AREA_UPPERBANK;
 488       int off = 0;
 489       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 490       for (int n = 16; n < num_xmm_regs; n++) {
 491         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 492       }
 493 #if COMPILER2_OR_JVMCI
 494       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 495       off = 0;
 496       for (int n = 0; n < KRegister::number_of_registers; n++) {
 497         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 498       }
 499 #endif
 500     }
 501   }
 502 
 503 #if COMPILER2_OR_JVMCI
 504   if (UseAPX) {
 505     int base_addr = XSAVE_AREA_EGPRS;
 506     int off = 0;
 507     for (int n = 16; n < Register::number_of_registers; n++) {
 508       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 509     }
 510   }
 511 #endif
 512 
 513   // Recover CPU state
 514   __ pop_FPU_state();
 515   __ restore_legacy_gprs();
 516   __ addq(rsp, 8);
 517   __ popf();
 518   // Get the rbp described implicitly by the calling convention (no oopMap)
 519   __ pop(rbp);
 520 }
 521 
 522 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 523 
 524   // Just restore result register. Only used by deoptimization. By
 525   // now any callee save register that needs to be restored to a c2
 526   // caller of the deoptee has been extracted into the vframeArray
 527   // and will be stuffed into the c2i adapter we create for later
 528   // restoration so only result registers need to be restored here.
 529 
 530   // Restore fp result register
 531   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 532   // Restore integer result register
 533   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 534   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 535 
 536   // Pop all of the register save are off the stack except the return address
 537   __ addptr(rsp, return_offset_in_bytes());
 538 }
 539 
 540 // Is vector's size (in bytes) bigger than a size saved by default?
 541 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
 542 bool SharedRuntime::is_wide_vector(int size) {
 543   return size > 16;
 544 }
 545 
 546 // ---------------------------------------------------------------------------
 547 // Read the array of BasicTypes from a signature, and compute where the
 548 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 549 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 550 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 551 // as framesizes are fixed.
 552 // VMRegImpl::stack0 refers to the first slot 0(sp).
 553 // and VMRegImpl::stack0+1 refers to the memory word 4-byes higher.
 554 // Register up to Register::number_of_registers are the 64-bit
 555 // integer registers.
 556 
 557 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 558 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 559 // units regardless of build. Of course for i486 there is no 64 bit build
 560 
 561 // The Java calling convention is a "shifted" version of the C ABI.
 562 // By skipping the first C ABI register we can call non-static jni methods
 563 // with small numbers of arguments without having to shuffle the arguments
 564 // at all. Since we control the java ABI we ought to at least get some
 565 // advantage out of it.
 566 
 567 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 568                                            VMRegPair *regs,
 569                                            int total_args_passed) {
 570 
 571   // Create the mapping between argument positions and
 572   // registers.
 573   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 574     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 575   };
 576   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 577     j_farg0, j_farg1, j_farg2, j_farg3,
 578     j_farg4, j_farg5, j_farg6, j_farg7
 579   };
 580 
 581 
 582   uint int_args = 0;
 583   uint fp_args = 0;
 584   uint stk_args = 0;
 585 
 586   for (int i = 0; i < total_args_passed; i++) {
 587     switch (sig_bt[i]) {
 588     case T_BOOLEAN:
 589     case T_CHAR:
 590     case T_BYTE:
 591     case T_SHORT:
 592     case T_INT:
 593       if (int_args < Argument::n_int_register_parameters_j) {
 594         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 595       } else {
 596         stk_args = align_up(stk_args, 2);
 597         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 598         stk_args += 1;
 599       }
 600       break;
 601     case T_VOID:
 602       // halves of T_LONG or T_DOUBLE
 603       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 604       regs[i].set_bad();
 605       break;
 606     case T_LONG:
 607       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 608       // fall through
 609     case T_OBJECT:
 610     case T_ARRAY:
 611     case T_ADDRESS:
 612       if (int_args < Argument::n_int_register_parameters_j) {
 613         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 614       } else {
 615         stk_args = align_up(stk_args, 2);
 616         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 617         stk_args += 2;
 618       }
 619       break;
 620     case T_FLOAT:
 621       if (fp_args < Argument::n_float_register_parameters_j) {
 622         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 623       } else {
 624         stk_args = align_up(stk_args, 2);
 625         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 626         stk_args += 1;
 627       }
 628       break;
 629     case T_DOUBLE:
 630       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 631       if (fp_args < Argument::n_float_register_parameters_j) {
 632         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 633       } else {
 634         stk_args = align_up(stk_args, 2);
 635         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 636         stk_args += 2;
 637       }
 638       break;
 639     default:
 640       ShouldNotReachHere();
 641       break;
 642     }
 643   }
 644 
 645   return stk_args;
 646 }
 647 
 648 // Patch the callers callsite with entry to compiled code if it exists.
 649 static void patch_callers_callsite(MacroAssembler *masm) {
 650   Label L;
 651   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 652   __ jcc(Assembler::equal, L);
 653 
 654   // Save the current stack pointer
 655   __ mov(r13, rsp);
 656   // Schedule the branch target address early.
 657   // Call into the VM to patch the caller, then jump to compiled callee
 658   // rax isn't live so capture return address while we easily can
 659   __ movptr(rax, Address(rsp, 0));
 660 
 661   // align stack so push_CPU_state doesn't fault
 662   __ andptr(rsp, -(StackAlignmentInBytes));
 663   __ push_CPU_state();
 664   __ vzeroupper();
 665   // VM needs caller's callsite
 666   // VM needs target method
 667   // This needs to be a long call since we will relocate this adapter to
 668   // the codeBuffer and it may not reach
 669 
 670   // Allocate argument register save area
 671   if (frame::arg_reg_save_area_bytes != 0) {
 672     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 673   }
 674   __ mov(c_rarg0, rbx);
 675   __ mov(c_rarg1, rax);
 676   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 677 
 678   // De-allocate argument register save area
 679   if (frame::arg_reg_save_area_bytes != 0) {
 680     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 681   }
 682 
 683   __ vzeroupper();
 684   __ pop_CPU_state();
 685   // restore sp
 686   __ mov(rsp, r13);
 687   __ bind(L);
 688 }
 689 
 690 
 691 static void gen_c2i_adapter(MacroAssembler *masm,
 692                             int total_args_passed,
 693                             int comp_args_on_stack,
 694                             const BasicType *sig_bt,
 695                             const VMRegPair *regs,
 696                             Label& skip_fixup) {
 697   // Before we get into the guts of the C2I adapter, see if we should be here
 698   // at all.  We've come from compiled code and are attempting to jump to the
 699   // interpreter, which means the caller made a static call to get here
 700   // (vcalls always get a compiled target if there is one).  Check for a
 701   // compiled target.  If there is one, we need to patch the caller's call.
 702   patch_callers_callsite(masm);
 703 
 704   __ bind(skip_fixup);
 705 
 706   // Since all args are passed on the stack, total_args_passed *
 707   // Interpreter::stackElementSize is the space we need.
 708 
 709   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 710 
 711   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 712 
 713   // stack is aligned, keep it that way
 714   // This is not currently needed or enforced by the interpreter, but
 715   // we might as well conform to the ABI.
 716   extraspace = align_up(extraspace, 2*wordSize);
 717 
 718   // set senderSP value
 719   __ lea(r13, Address(rsp, wordSize));
 720 
 721 #ifdef ASSERT
 722   __ check_stack_alignment(r13, "sender stack not aligned");
 723 #endif
 724   if (extraspace > 0) {
 725     // Pop the return address
 726     __ pop(rax);
 727 
 728     __ subptr(rsp, extraspace);
 729 
 730     // Push the return address
 731     __ push(rax);
 732 
 733     // Account for the return address location since we store it first rather
 734     // than hold it in a register across all the shuffling
 735     extraspace += wordSize;
 736   }
 737 
 738 #ifdef ASSERT
 739   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 740 #endif
 741 
 742   // Now write the args into the outgoing interpreter space
 743   for (int i = 0; i < total_args_passed; i++) {
 744     if (sig_bt[i] == T_VOID) {
 745       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 746       continue;
 747     }
 748 
 749     // offset to start parameters
 750     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 751     int next_off = st_off - Interpreter::stackElementSize;
 752 
 753     // Say 4 args:
 754     // i   st_off
 755     // 0   32 T_LONG
 756     // 1   24 T_VOID
 757     // 2   16 T_OBJECT
 758     // 3    8 T_BOOL
 759     // -    0 return address
 760     //
 761     // However to make thing extra confusing. Because we can fit a long/double in
 762     // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter
 763     // leaves one slot empty and only stores to a single slot. In this case the
 764     // slot that is occupied is the T_VOID slot. See I said it was confusing.
 765 
 766     VMReg r_1 = regs[i].first();
 767     VMReg r_2 = regs[i].second();
 768     if (!r_1->is_valid()) {
 769       assert(!r_2->is_valid(), "");
 770       continue;
 771     }
 772     if (r_1->is_stack()) {
 773       // memory to memory use rax
 774       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 775       if (!r_2->is_valid()) {
 776         // sign extend??
 777         __ movl(rax, Address(rsp, ld_off));
 778         __ movptr(Address(rsp, st_off), rax);
 779 
 780       } else {
 781 
 782         __ movq(rax, Address(rsp, ld_off));
 783 
 784         // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 785         // T_DOUBLE and T_LONG use two slots in the interpreter
 786         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 787           // ld_off == LSW, ld_off+wordSize == MSW
 788           // st_off == MSW, next_off == LSW
 789           __ movq(Address(rsp, next_off), rax);
 790 #ifdef ASSERT
 791           // Overwrite the unused slot with known junk
 792           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 793           __ movptr(Address(rsp, st_off), rax);
 794 #endif /* ASSERT */
 795         } else {
 796           __ movq(Address(rsp, st_off), rax);
 797         }
 798       }
 799     } else if (r_1->is_Register()) {
 800       Register r = r_1->as_Register();
 801       if (!r_2->is_valid()) {
 802         // must be only an int (or less ) so move only 32bits to slot
 803         // why not sign extend??
 804         __ movl(Address(rsp, st_off), r);
 805       } else {
 806         // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 807         // T_DOUBLE and T_LONG use two slots in the interpreter
 808         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 809           // long/double in gpr
 810 #ifdef ASSERT
 811           // Overwrite the unused slot with known junk
 812           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 813           __ movptr(Address(rsp, st_off), rax);
 814 #endif /* ASSERT */
 815           __ movq(Address(rsp, next_off), r);
 816         } else {
 817           __ movptr(Address(rsp, st_off), r);
 818         }
 819       }
 820     } else {
 821       assert(r_1->is_XMMRegister(), "");
 822       if (!r_2->is_valid()) {
 823         // only a float use just part of the slot
 824         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 825       } else {
 826 #ifdef ASSERT
 827         // Overwrite the unused slot with known junk
 828         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 829         __ movptr(Address(rsp, st_off), rax);
 830 #endif /* ASSERT */
 831         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 832       }
 833     }
 834   }
 835 
 836   // Schedule the branch target address early.
 837   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 838   __ jmp(rcx);
 839 }
 840 
 841 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 842                         address code_start, address code_end,
 843                         Label& L_ok) {
 844   Label L_fail;
 845   __ lea(temp_reg, ExternalAddress(code_start));
 846   __ cmpptr(pc_reg, temp_reg);
 847   __ jcc(Assembler::belowEqual, L_fail);
 848   __ lea(temp_reg, ExternalAddress(code_end));
 849   __ cmpptr(pc_reg, temp_reg);
 850   __ jcc(Assembler::below, L_ok);
 851   __ bind(L_fail);
 852 }
 853 
 854 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 855                                     int total_args_passed,
 856                                     int comp_args_on_stack,
 857                                     const BasicType *sig_bt,
 858                                     const VMRegPair *regs) {
 859 
 860   // Note: r13 contains the senderSP on entry. We must preserve it since
 861   // we may do a i2c -> c2i transition if we lose a race where compiled
 862   // code goes non-entrant while we get args ready.
 863   // In addition we use r13 to locate all the interpreter args as
 864   // we must align the stack to 16 bytes on an i2c entry else we
 865   // lose alignment we expect in all compiled code and register
 866   // save code can segv when fxsave instructions find improperly
 867   // aligned stack pointer.
 868 
 869   // Adapters can be frameless because they do not require the caller
 870   // to perform additional cleanup work, such as correcting the stack pointer.
 871   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 872   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 873   // even if a callee has modified the stack pointer.
 874   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 875   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 876   // up via the senderSP register).
 877   // In other words, if *either* the caller or callee is interpreted, we can
 878   // get the stack pointer repaired after a call.
 879   // This is why c2i and i2c adapters cannot be indefinitely composed.
 880   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 881   // both caller and callee would be compiled methods, and neither would
 882   // clean up the stack pointer changes performed by the two adapters.
 883   // If this happens, control eventually transfers back to the compiled
 884   // caller, but with an uncorrected stack, causing delayed havoc.
 885 
 886   if (VerifyAdapterCalls &&
 887       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 888     // So, let's test for cascading c2i/i2c adapters right now.
 889     //  assert(Interpreter::contains($return_addr) ||
 890     //         StubRoutines::contains($return_addr),
 891     //         "i2c adapter must return to an interpreter frame");
 892     __ block_comment("verify_i2c { ");
 893     // Pick up the return address
 894     __ movptr(rax, Address(rsp, 0));
 895     Label L_ok;
 896     if (Interpreter::code() != nullptr) {
 897       range_check(masm, rax, r11,
 898                   Interpreter::code()->code_start(),
 899                   Interpreter::code()->code_end(),
 900                   L_ok);
 901     }
 902     if (StubRoutines::initial_stubs_code() != nullptr) {
 903       range_check(masm, rax, r11,
 904                   StubRoutines::initial_stubs_code()->code_begin(),
 905                   StubRoutines::initial_stubs_code()->code_end(),
 906                   L_ok);
 907     }
 908     if (StubRoutines::final_stubs_code() != nullptr) {
 909       range_check(masm, rax, r11,
 910                   StubRoutines::final_stubs_code()->code_begin(),
 911                   StubRoutines::final_stubs_code()->code_end(),
 912                   L_ok);
 913     }
 914     const char* msg = "i2c adapter must return to an interpreter frame";
 915     __ block_comment(msg);
 916     __ stop(msg);
 917     __ bind(L_ok);
 918     __ block_comment("} verify_i2ce ");
 919   }
 920 
 921   // Must preserve original SP for loading incoming arguments because
 922   // we need to align the outgoing SP for compiled code.
 923   __ movptr(r11, rsp);
 924 
 925   // Pick up the return address
 926   __ pop(rax);
 927 
 928   // Convert 4-byte c2 stack slots to words.
 929   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 930 
 931   if (comp_args_on_stack) {
 932     __ subptr(rsp, comp_words_on_stack * wordSize);
 933   }
 934 
 935   // Ensure compiled code always sees stack at proper alignment
 936   __ andptr(rsp, -16);
 937 
 938   // push the return address and misalign the stack that youngest frame always sees
 939   // as far as the placement of the call instruction
 940   __ push(rax);
 941 
 942   // Put saved SP in another register
 943   const Register saved_sp = rax;
 944   __ movptr(saved_sp, r11);
 945 
 946   // Will jump to the compiled code just as if compiled code was doing it.
 947   // Pre-load the register-jump target early, to schedule it better.
 948   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 949 
 950 #if INCLUDE_JVMCI
 951   if (EnableJVMCI) {
 952     // check if this call should be routed towards a specific entry point
 953     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 954     Label no_alternative_target;
 955     __ jcc(Assembler::equal, no_alternative_target);
 956     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 957     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 958     __ bind(no_alternative_target);
 959   }
 960 #endif // INCLUDE_JVMCI
 961 
 962   // Now generate the shuffle code.  Pick up all register args and move the
 963   // rest through the floating point stack top.
 964   for (int i = 0; i < total_args_passed; i++) {
 965     if (sig_bt[i] == T_VOID) {
 966       // Longs and doubles are passed in native word order, but misaligned
 967       // in the 32-bit build.
 968       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 969       continue;
 970     }
 971 
 972     // Pick up 0, 1 or 2 words from SP+offset.
 973 
 974     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 975             "scrambled load targets?");
 976     // Load in argument order going down.
 977     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 978     // Point to interpreter value (vs. tag)
 979     int next_off = ld_off - Interpreter::stackElementSize;
 980     //
 981     //
 982     //
 983     VMReg r_1 = regs[i].first();
 984     VMReg r_2 = regs[i].second();
 985     if (!r_1->is_valid()) {
 986       assert(!r_2->is_valid(), "");
 987       continue;
 988     }
 989     if (r_1->is_stack()) {
 990       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 991       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 992 
 993       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 994       // and if we end up going thru a c2i because of a miss a reasonable value of r13
 995       // will be generated.
 996       if (!r_2->is_valid()) {
 997         // sign extend???
 998         __ movl(r13, Address(saved_sp, ld_off));
 999         __ movptr(Address(rsp, st_off), r13);
1000       } else {
1001         //
1002         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
1003         // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
1004         // So we must adjust where to pick up the data to match the interpreter.
1005         //
1006         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
1007         // are accessed as negative so LSW is at LOW address
1008 
1009         // ld_off is MSW so get LSW
1010         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
1011                            next_off : ld_off;
1012         __ movq(r13, Address(saved_sp, offset));
1013         // st_off is LSW (i.e. reg.first())
1014         __ movq(Address(rsp, st_off), r13);
1015       }
1016     } else if (r_1->is_Register()) {  // Register argument
1017       Register r = r_1->as_Register();
1018       assert(r != rax, "must be different");
1019       if (r_2->is_valid()) {
1020         //
1021         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
1022         // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
1023         // So we must adjust where to pick up the data to match the interpreter.
1024 
1025         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
1026                            next_off : ld_off;
1027 
1028         // this can be a misaligned move
1029         __ movq(r, Address(saved_sp, offset));
1030       } else {
1031         // sign extend and use a full word?
1032         __ movl(r, Address(saved_sp, ld_off));
1033       }
1034     } else {
1035       if (!r_2->is_valid()) {
1036         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1037       } else {
1038         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1039       }
1040     }
1041   }
1042 
1043   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1044 
1045   // 6243940 We might end up in handle_wrong_method if
1046   // the callee is deoptimized as we race thru here. If that
1047   // happens we don't want to take a safepoint because the
1048   // caller frame will look interpreted and arguments are now
1049   // "compiled" so it is much better to make this transition
1050   // invisible to the stack walking code. Unfortunately if
1051   // we try and find the callee by normal means a safepoint
1052   // is possible. So we stash the desired callee in the thread
1053   // and the vm will find there should this case occur.
1054 
1055   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1056 
1057   // put Method* where a c2i would expect should we end up there
1058   // only needed because eof c2 resolve stubs return Method* as a result in
1059   // rax
1060   __ mov(rax, rbx);
1061   __ jmp(r11);
1062 }
1063 
1064 // ---------------------------------------------------------------
1065 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1066                                                             int total_args_passed,
1067                                                             int comp_args_on_stack,
1068                                                             const BasicType *sig_bt,
1069                                                             const VMRegPair *regs,
1070                                                             AdapterFingerPrint* fingerprint) {
1071   address i2c_entry = __ pc();
1072 
1073   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1074 
1075   // -------------------------------------------------------------------------
1076   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1077   // to the interpreter.  The args start out packed in the compiled layout.  They
1078   // need to be unpacked into the interpreter layout.  This will almost always
1079   // require some stack space.  We grow the current (compiled) stack, then repack
1080   // the args.  We  finally end in a jump to the generic interpreter entry point.
1081   // On exit from the interpreter, the interpreter will restore our SP (lest the
1082   // compiled code, which relies solely on SP and not RBP, get sick).
1083 
1084   address c2i_unverified_entry = __ pc();
1085   Label skip_fixup;
1086 
1087   Register data = rax;
1088   Register receiver = j_rarg0;
1089   Register temp = rbx;
1090 
1091   {
1092     __ ic_check(1 /* end_alignment */);
1093     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1094     // Method might have been compiled since the call site was patched to
1095     // interpreted if that is the case treat it as a miss so we can get
1096     // the call site corrected.
1097     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1098     __ jcc(Assembler::equal, skip_fixup);
1099     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1100   }
1101 
1102   address c2i_entry = __ pc();
1103 
1104   // Class initialization barrier for static methods
1105   address c2i_no_clinit_check_entry = nullptr;
1106   if (VM_Version::supports_fast_class_init_checks()) {
1107     Label L_skip_barrier;
1108     Register method = rbx;
1109 
1110     { // Bypass the barrier for non-static methods
1111       Register flags = rscratch1;
1112       __ movl(flags, Address(method, Method::access_flags_offset()));
1113       __ testl(flags, JVM_ACC_STATIC);
1114       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1115     }
1116 
1117     Register klass = rscratch1;
1118     __ load_method_holder(klass, method);
1119     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1120 
1121     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1122 
1123     __ bind(L_skip_barrier);
1124     c2i_no_clinit_check_entry = __ pc();
1125   }
1126 
1127   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1128   bs->c2i_entry_barrier(masm);
1129 
1130   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1131 
1132   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1133 }
1134 
1135 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1136                                          VMRegPair *regs,
1137                                          int total_args_passed) {
1138 
1139 // We return the amount of VMRegImpl stack slots we need to reserve for all
1140 // the arguments NOT counting out_preserve_stack_slots.
1141 
1142 // NOTE: These arrays will have to change when c1 is ported
1143 #ifdef _WIN64
1144     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1145       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1146     };
1147     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1148       c_farg0, c_farg1, c_farg2, c_farg3
1149     };
1150 #else
1151     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1152       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1153     };
1154     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1155       c_farg0, c_farg1, c_farg2, c_farg3,
1156       c_farg4, c_farg5, c_farg6, c_farg7
1157     };
1158 #endif // _WIN64
1159 
1160 
1161     uint int_args = 0;
1162     uint fp_args = 0;
1163     uint stk_args = 0; // inc by 2 each time
1164 
1165     for (int i = 0; i < total_args_passed; i++) {
1166       switch (sig_bt[i]) {
1167       case T_BOOLEAN:
1168       case T_CHAR:
1169       case T_BYTE:
1170       case T_SHORT:
1171       case T_INT:
1172         if (int_args < Argument::n_int_register_parameters_c) {
1173           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1174 #ifdef _WIN64
1175           fp_args++;
1176           // Allocate slots for callee to stuff register args the stack.
1177           stk_args += 2;
1178 #endif
1179         } else {
1180           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1181           stk_args += 2;
1182         }
1183         break;
1184       case T_LONG:
1185         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1186         // fall through
1187       case T_OBJECT:
1188       case T_ARRAY:
1189       case T_ADDRESS:
1190       case T_METADATA:
1191         if (int_args < Argument::n_int_register_parameters_c) {
1192           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1193 #ifdef _WIN64
1194           fp_args++;
1195           stk_args += 2;
1196 #endif
1197         } else {
1198           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1199           stk_args += 2;
1200         }
1201         break;
1202       case T_FLOAT:
1203         if (fp_args < Argument::n_float_register_parameters_c) {
1204           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1205 #ifdef _WIN64
1206           int_args++;
1207           // Allocate slots for callee to stuff register args the stack.
1208           stk_args += 2;
1209 #endif
1210         } else {
1211           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1212           stk_args += 2;
1213         }
1214         break;
1215       case T_DOUBLE:
1216         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1217         if (fp_args < Argument::n_float_register_parameters_c) {
1218           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1219 #ifdef _WIN64
1220           int_args++;
1221           // Allocate slots for callee to stuff register args the stack.
1222           stk_args += 2;
1223 #endif
1224         } else {
1225           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1226           stk_args += 2;
1227         }
1228         break;
1229       case T_VOID: // Halves of longs and doubles
1230         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1231         regs[i].set_bad();
1232         break;
1233       default:
1234         ShouldNotReachHere();
1235         break;
1236       }
1237     }
1238 #ifdef _WIN64
1239   // windows abi requires that we always allocate enough stack space
1240   // for 4 64bit registers to be stored down.
1241   if (stk_args < 8) {
1242     stk_args = 8;
1243   }
1244 #endif // _WIN64
1245 
1246   return stk_args;
1247 }
1248 
1249 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1250                                              uint num_bits,
1251                                              uint total_args_passed) {
1252   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1253          "only certain vector sizes are supported for now");
1254 
1255   static const XMMRegister VEC_ArgReg[32] = {
1256      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1257      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1258     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1259     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1260   };
1261 
1262   uint stk_args = 0;
1263   uint fp_args = 0;
1264 
1265   for (uint i = 0; i < total_args_passed; i++) {
1266     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1267     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1268     regs[i].set_pair(vmreg->next(next_val), vmreg);
1269   }
1270 
1271   return stk_args;
1272 }
1273 
1274 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1275   // We always ignore the frame_slots arg and just use the space just below frame pointer
1276   // which by this time is free to use
1277   switch (ret_type) {
1278   case T_FLOAT:
1279     __ movflt(Address(rbp, -wordSize), xmm0);
1280     break;
1281   case T_DOUBLE:
1282     __ movdbl(Address(rbp, -wordSize), xmm0);
1283     break;
1284   case T_VOID:  break;
1285   default: {
1286     __ movptr(Address(rbp, -wordSize), rax);
1287     }
1288   }
1289 }
1290 
1291 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1292   // We always ignore the frame_slots arg and just use the space just below frame pointer
1293   // which by this time is free to use
1294   switch (ret_type) {
1295   case T_FLOAT:
1296     __ movflt(xmm0, Address(rbp, -wordSize));
1297     break;
1298   case T_DOUBLE:
1299     __ movdbl(xmm0, Address(rbp, -wordSize));
1300     break;
1301   case T_VOID:  break;
1302   default: {
1303     __ movptr(rax, Address(rbp, -wordSize));
1304     }
1305   }
1306 }
1307 
1308 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1309     for ( int i = first_arg ; i < arg_count ; i++ ) {
1310       if (args[i].first()->is_Register()) {
1311         __ push(args[i].first()->as_Register());
1312       } else if (args[i].first()->is_XMMRegister()) {
1313         __ subptr(rsp, 2*wordSize);
1314         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1315       }
1316     }
1317 }
1318 
1319 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1320     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1321       if (args[i].first()->is_Register()) {
1322         __ pop(args[i].first()->as_Register());
1323       } else if (args[i].first()->is_XMMRegister()) {
1324         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1325         __ addptr(rsp, 2*wordSize);
1326       }
1327     }
1328 }
1329 
1330 static void verify_oop_args(MacroAssembler* masm,
1331                             const methodHandle& method,
1332                             const BasicType* sig_bt,
1333                             const VMRegPair* regs) {
1334   Register temp_reg = rbx;  // not part of any compiled calling seq
1335   if (VerifyOops) {
1336     for (int i = 0; i < method->size_of_parameters(); i++) {
1337       if (is_reference_type(sig_bt[i])) {
1338         VMReg r = regs[i].first();
1339         assert(r->is_valid(), "bad oop arg");
1340         if (r->is_stack()) {
1341           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1342           __ verify_oop(temp_reg);
1343         } else {
1344           __ verify_oop(r->as_Register());
1345         }
1346       }
1347     }
1348   }
1349 }
1350 
1351 static void check_continuation_enter_argument(VMReg actual_vmreg,
1352                                               Register expected_reg,
1353                                               const char* name) {
1354   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1355   assert(actual_vmreg->as_Register() == expected_reg,
1356          "%s is in unexpected register: %s instead of %s",
1357          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1358 }
1359 
1360 
1361 //---------------------------- continuation_enter_setup ---------------------------
1362 //
1363 // Arguments:
1364 //   None.
1365 //
1366 // Results:
1367 //   rsp: pointer to blank ContinuationEntry
1368 //
1369 // Kills:
1370 //   rax
1371 //
1372 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1373   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1374   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1375   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1376 
1377   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1378   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1379 
1380   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
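       // frame_size is ContinuationEntry::size() plus one word, converted to 4-byte stack
       // slots; it is only used to size the OopMap created below.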
1381   OopMap* map = new OopMap(frame_size, 0);
1382 
1383   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1384   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1385   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1386 
1387   return map;
1388 }
1389 
1390 //---------------------------- fill_continuation_entry ---------------------------
1391 //
1392 // Arguments:
1393 //   rsp: pointer to a blank ContinuationEntry
1394 //   reg_cont_obj: pointer to the continuation
1395 //   reg_flags: flags
1396 //
1397 // Results:
1398 //   rsp: pointer to filled out ContinuationEntry
1399 //
1400 // Kills:
1401 //   rax
1402 //
1403 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1404   assert_different_registers(rax, reg_cont_obj, reg_flags);
1405 #ifdef ASSERT
1406   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1407 #endif
1408   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1409   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1410   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1411   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1412   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1413 
1414   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1415   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1416   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1417   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1418 
1419   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1420   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1421 }
1422 
1423 //---------------------------- continuation_enter_cleanup ---------------------------
1424 //
1425 // Arguments:
1426 //   rsp: pointer to the ContinuationEntry
1427 //
1428 // Results:
1429 //   rsp: pointer to the spilled rbp in the entry frame
1430 //
1431 // Kills:
1432 //   rbx
1433 //
1434 static void continuation_enter_cleanup(MacroAssembler* masm) {
1435 #ifdef ASSERT
1436   Label L_good_sp;
1437   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1438   __ jcc(Assembler::equal, L_good_sp);
1439   __ stop("Incorrect rsp at continuation_enter_cleanup");
1440   __ bind(L_good_sp);
1441 #endif
1442   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1443   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1444 
1445   if (CheckJNICalls) {
1446     // Check if this is a virtual thread continuation
1447     Label L_skip_vthread_code;
1448     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1449     __ jcc(Assembler::equal, L_skip_vthread_code);
1450 
1451     // If the held monitor count is > 0 and this vthread is terminating then
1452     // it failed to release a JNI monitor. So we issue the same log message
1453     // that JavaThread::exit does.
1454     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1455     __ jcc(Assembler::equal, L_skip_vthread_code);
1456 
1457     // rax may hold an exception oop, save it before the call
1458     __ push(rax);
1459     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1460     __ pop(rax);
1461 
1462     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1463     // on termination. The held count is implicitly zeroed below when we restore from
1464     // the parent held count (which has to be zero).
1465     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1466 
1467     __ bind(L_skip_vthread_code);
1468   }
1469 #ifdef ASSERT
1470   else {
1471     // Check if this is a virtual thread continuation
1472     Label L_skip_vthread_code;
1473     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1474     __ jcc(Assembler::equal, L_skip_vthread_code);
1475 
1476     // See comment just above. If not checking JNI calls the JNI count is only
1477     // needed for assertion checking.
1478     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1479 
1480     __ bind(L_skip_vthread_code);
1481   }
1482 #endif
1483 
1484   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1485   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1486 
1487   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1488   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1489   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1490 }
1491 
1492 static void gen_continuation_enter(MacroAssembler* masm,
1493                                    const VMRegPair* regs,
1494                                    int& exception_offset,
1495                                    OopMapSet* oop_maps,
1496                                    int& frame_complete,
1497                                    int& stack_slots,
1498                                    int& interpreted_entry_offset,
1499                                    int& compiled_entry_offset) {
1500 
1501   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1502   int pos_cont_obj   = 0;
1503   int pos_is_cont    = 1;
1504   int pos_is_virtual = 2;
1505 
1506   // The platform-specific calling convention may present the arguments in various registers.
1507   // To simplify the rest of the code, we expect the arguments to reside in these known
1508   // registers, and we additionally check the placement here in case the calling convention
1509   // ever changes.
1510   Register reg_cont_obj   = c_rarg1;
1511   Register reg_is_cont    = c_rarg2;
1512   Register reg_is_virtual = c_rarg3;
1513 
1514   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1515   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1516   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1517 
1518   // Utility methods kill rax, make sure there are no collisions
1519   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1520 
1521   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1522                          relocInfo::static_call_type);
1523 
1524   address start = __ pc();
1525 
1526   Label L_thaw, L_exit;
1527 
1528   // i2i entry used at interp_only_mode only
1529   interpreted_entry_offset = __ pc() - start;
1530   {
1531 #ifdef ASSERT
1532     Label is_interp_only;
1533     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1534     __ jcc(Assembler::notEqual, is_interp_only);
1535     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1536     __ bind(is_interp_only);
1537 #endif
1538 
1539     __ pop(rax); // return address
1540     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
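         // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread):
         // the interpreter pushes arguments left to right, so the last argument ends up at the
         // lowest stack element -- hence c is at element 2, isContinue at element 1 and
         // isVirtualThread at element 0.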
1541     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1542     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1543     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1544     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1545     __ push(rax); // return address
1546     __ push_cont_fastpath();
1547 
1548     __ enter();
1549 
1550     stack_slots = 2; // will be adjusted in setup
1551     OopMap* map = continuation_enter_setup(masm, stack_slots);
1552     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1553     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1554 
1555     __ verify_oop(reg_cont_obj);
1556 
1557     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1558 
1559     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1560     __ testptr(reg_is_cont, reg_is_cont);
1561     __ jcc(Assembler::notZero, L_thaw);
1562 
1563     // --- Resolve path
1564 
1565     // Make sure the call is patchable
1566     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1567     // Emit stub for static call
1568     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1569     if (stub == nullptr) {
1570       fatal("CodeCache is full at gen_continuation_enter");
1571     }
1572     __ call(resolve);
1573     oop_maps->add_gc_map(__ pc() - start, map);
1574     __ post_call_nop();
1575 
1576     __ jmp(L_exit);
1577   }
1578 
1579   // compiled entry
1580   __ align(CodeEntryAlignment);
1581   compiled_entry_offset = __ pc() - start;
1582   __ enter();
1583 
1584   stack_slots = 2; // will be adjusted in setup
1585   OopMap* map = continuation_enter_setup(masm, stack_slots);
1586 
1587   // Frame is now completed as far as size and linkage.
1588   frame_complete = __ pc() - start;
1589 
1590   __ verify_oop(reg_cont_obj);
1591 
1592   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1593 
1594   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1595   __ testptr(reg_is_cont, reg_is_cont);
1596   __ jccb(Assembler::notZero, L_thaw);
1597 
1598   // --- call Continuation.enter(Continuation c, boolean isContinue)
1599 
1600   // Make sure the call is patchable
1601   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1602 
1603   // Emit stub for static call
1604   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1605   if (stub == nullptr) {
1606     fatal("CodeCache is full at gen_continuation_enter");
1607   }
1608 
1609   // The call needs to be resolved. There's a special case for this in
1610   // SharedRuntime::find_callee_info_helper() which calls
1611   // LinkResolver::resolve_continuation_enter() which resolves the call to
1612   // Continuation.enter(Continuation c, boolean isContinue).
1613   __ call(resolve);
1614 
1615   oop_maps->add_gc_map(__ pc() - start, map);
1616   __ post_call_nop();
1617 
1618   __ jmpb(L_exit);
1619 
1620   // --- Thawing path
1621 
1622   __ bind(L_thaw);
1623 
1624   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1625 
1626   ContinuationEntry::_return_pc_offset = __ pc() - start;
1627   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1628   __ post_call_nop();
1629 
1630   // --- Normal exit (resolve/thawing)
1631 
1632   __ bind(L_exit);
1633 
1634   continuation_enter_cleanup(masm);
1635   __ pop(rbp);
1636   __ ret(0);
1637 
1638   // --- Exception handling path
1639 
1640   exception_offset = __ pc() - start;
1641 
1642   continuation_enter_cleanup(masm);
1643   __ pop(rbp);
1644 
1645   __ movptr(c_rarg0, r15_thread);
1646   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1647 
1648   // rax still holds the original exception oop, save it before the call
1649   __ push(rax);
1650 
1651   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1652   __ movptr(rbx, rax);
1653 
1654   // Continue at exception handler:
1655   //   rax: exception oop
1656   //   rbx: exception handler
1657   //   rdx: exception pc
1658   __ pop(rax);
1659   __ verify_oop(rax);
1660   __ pop(rdx);
1661   __ jmp(rbx);
1662 }
1663 
1664 static void gen_continuation_yield(MacroAssembler* masm,
1665                                    const VMRegPair* regs,
1666                                    OopMapSet* oop_maps,
1667                                    int& frame_complete,
1668                                    int& stack_slots,
1669                                    int& compiled_entry_offset) {
1670   enum layout {
1671     rbp_off,
1672     rbpH_off,
1673     return_off,
1674     return_off2,
1675     framesize // inclusive of return address
1676   };
1677   stack_slots = framesize /  VMRegImpl::slots_per_word;
1678   assert(stack_slots == 2, "recheck layout");
1679 
1680   address start = __ pc();
1681   compiled_entry_offset = __ pc() - start;
1682   __ enter();
1683   address the_pc = __ pc();
1684 
1685   frame_complete = the_pc - start;
1686 
1687   // This nop must be exactly at the PC we push into the frame info.
1688   // We use this nop for fast CodeBlob lookup, so we associate the OopMap
1689   // with it right away.
1690   __ post_call_nop();
1691   OopMap* map = new OopMap(framesize, 1);
1692   oop_maps->add_gc_map(frame_complete, map);
1693 
1694   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1695   __ movptr(c_rarg0, r15_thread);
1696   __ movptr(c_rarg1, rsp);
1697   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1698   __ reset_last_Java_frame(true);
1699 
1700   Label L_pinned;
1701 
1702   __ testptr(rax, rax);
1703   __ jcc(Assembler::notZero, L_pinned);
1704 
1705   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1706   continuation_enter_cleanup(masm);
1707   __ pop(rbp);
1708   __ ret(0);
1709 
1710   __ bind(L_pinned);
1711 
1712   // Pinned, return to caller
1713 
1714   // handle pending exception thrown by freeze
1715   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1716   Label ok;
1717   __ jcc(Assembler::equal, ok);
1718   __ leave();
1719   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1720   __ bind(ok);
1721 
1722   __ leave();
1723   __ ret(0);
1724 }
1725 
1726 static void gen_special_dispatch(MacroAssembler* masm,
1727                                  const methodHandle& method,
1728                                  const BasicType* sig_bt,
1729                                  const VMRegPair* regs) {
1730   verify_oop_args(masm, method, sig_bt, regs);
1731   vmIntrinsics::ID iid = method->intrinsic_id();
1732 
1733   // Now write the args into the outgoing interpreter space
1734   bool     has_receiver   = false;
1735   Register receiver_reg   = noreg;
1736   int      member_arg_pos = -1;
1737   Register member_reg     = noreg;
1738   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1739   if (ref_kind != 0) {
1740     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1741     member_reg = rbx;  // known to be free at this point
1742     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1743   } else if (iid == vmIntrinsics::_invokeBasic) {
1744     has_receiver = true;
1745   } else if (iid == vmIntrinsics::_linkToNative) {
1746     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1747     member_reg = rbx;  // known to be free at this point
1748   } else {
1749     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1750   }
1751 
1752   if (member_reg != noreg) {
1753     // Load the member_arg into register, if necessary.
1754     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1755     VMReg r = regs[member_arg_pos].first();
1756     if (r->is_stack()) {
1757       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1758     } else {
1759       // no data motion is needed
1760       member_reg = r->as_Register();
1761     }
1762   }
1763 
1764   if (has_receiver) {
1765     // Make sure the receiver is loaded into a register.
1766     assert(method->size_of_parameters() > 0, "oob");
1767     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1768     VMReg r = regs[0].first();
1769     assert(r->is_valid(), "bad receiver arg");
1770     if (r->is_stack()) {
1771       // Porting note:  This assumes that compiled calling conventions always
1772       // pass the receiver oop in a register.  If this is not true on some
1773       // platform, pick a temp and load the receiver from stack.
1774       fatal("receiver always in a register");
1775       receiver_reg = j_rarg0;  // known to be free at this point
1776       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1777     } else {
1778       // no data motion is needed
1779       receiver_reg = r->as_Register();
1780     }
1781   }
1782 
1783   // Figure out which address we are really jumping to:
1784   MethodHandles::generate_method_handle_dispatch(masm, iid,
1785                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1786 }
1787 
1788 // ---------------------------------------------------------------------------
1789 // Generate a native wrapper for a given method.  The method takes arguments
1790 // in the Java compiled code convention, marshals them to the native
1791 // convention (handlizes oops, etc), transitions to native, makes the call,
1792 // returns to java state (possibly blocking), unhandlizes any result and
1793 // returns.
1794 //
1795 // Critical native functions are a shorthand for the use of
1796 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1797 // functions.  The wrapper is expected to unpack the arguments before
1798 // passing them to the callee. Critical native functions leave the state _in_Java,
1799 // since they cannot stop for GC.
1800 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1801 // block and the check for pending exceptions, since it's impossible for them
1802 // to be thrown.
1803 //
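     // Roughly, the generated wrapper: performs the inline cache check and (if needed) a class
     // init barrier, bangs the stack and sets up a frame, shuffles the Java args into the C
     // calling convention (handlizing oops and, for static methods, the class mirror), records
     // last_Java_frame, locks if the method is synchronized, transitions to _thread_in_native,
     // calls the native function, transitions back (with a safepoint poll and possible stack
     // reguard), unlocks, unhandlizes an oop result and checks for a pending exception.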
1804 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1805                                                 const methodHandle& method,
1806                                                 int compile_id,
1807                                                 BasicType* in_sig_bt,
1808                                                 VMRegPair* in_regs,
1809                                                 BasicType ret_type) {
1810   if (method->is_continuation_native_intrinsic()) {
1811     int exception_offset = -1;
1812     OopMapSet* oop_maps = new OopMapSet();
1813     int frame_complete = -1;
1814     int stack_slots = -1;
1815     int interpreted_entry_offset = -1;
1816     int vep_offset = -1;
1817     if (method->is_continuation_enter_intrinsic()) {
1818       gen_continuation_enter(masm,
1819                              in_regs,
1820                              exception_offset,
1821                              oop_maps,
1822                              frame_complete,
1823                              stack_slots,
1824                              interpreted_entry_offset,
1825                              vep_offset);
1826     } else if (method->is_continuation_yield_intrinsic()) {
1827       gen_continuation_yield(masm,
1828                              in_regs,
1829                              oop_maps,
1830                              frame_complete,
1831                              stack_slots,
1832                              vep_offset);
1833     } else {
1834       guarantee(false, "Unknown Continuation native intrinsic");
1835     }
1836 
1837 #ifdef ASSERT
1838     if (method->is_continuation_enter_intrinsic()) {
1839       assert(interpreted_entry_offset != -1, "Must be set");
1840       assert(exception_offset != -1,         "Must be set");
1841     } else {
1842       assert(interpreted_entry_offset == -1, "Must be unset");
1843       assert(exception_offset == -1,         "Must be unset");
1844     }
1845     assert(frame_complete != -1,    "Must be set");
1846     assert(stack_slots != -1,       "Must be set");
1847     assert(vep_offset != -1,        "Must be set");
1848 #endif
1849 
1850     __ flush();
1851     nmethod* nm = nmethod::new_native_nmethod(method,
1852                                               compile_id,
1853                                               masm->code(),
1854                                               vep_offset,
1855                                               frame_complete,
1856                                               stack_slots,
1857                                               in_ByteSize(-1),
1858                                               in_ByteSize(-1),
1859                                               oop_maps,
1860                                               exception_offset);
1861     if (nm == nullptr) return nm;
1862     if (method->is_continuation_enter_intrinsic()) {
1863       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1864     } else if (method->is_continuation_yield_intrinsic()) {
1865       _cont_doYield_stub = nm;
1866     }
1867     return nm;
1868   }
1869 
1870   if (method->is_method_handle_intrinsic()) {
1871     vmIntrinsics::ID iid = method->intrinsic_id();
1872     intptr_t start = (intptr_t)__ pc();
1873     int vep_offset = ((intptr_t)__ pc()) - start;
1874     gen_special_dispatch(masm,
1875                          method,
1876                          in_sig_bt,
1877                          in_regs);
1878     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1879     __ flush();
1880     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1881     return nmethod::new_native_nmethod(method,
1882                                        compile_id,
1883                                        masm->code(),
1884                                        vep_offset,
1885                                        frame_complete,
1886                                        stack_slots / VMRegImpl::slots_per_word,
1887                                        in_ByteSize(-1),
1888                                        in_ByteSize(-1),
1889                                        nullptr);
1890   }
1891   address native_func = method->native_function();
1892   assert(native_func != nullptr, "must have function");
1893 
1894   // An OopMap for lock (and class if static)
1895   OopMapSet *oop_maps = new OopMapSet();
1896   intptr_t start = (intptr_t)__ pc();
1897 
1898   // We have received a description of where all the java args are located
1899   // on entry to the wrapper. We need to convert these args to where
1900   // the jni function will expect them. To figure out where they go
1901   // we convert the java signature to a C signature by inserting
1902   // the hidden arguments as arg[0] and possibly arg[1] (static method)
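       // For example, a static native method taking (Object, long) is invoked as
       // (JNIEnv*, jclass, jobject, jlong), so out_sig_bt is T_ADDRESS, T_OBJECT followed by
       // the Java argument types.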
1903 
1904   const int total_in_args = method->size_of_parameters();
1905   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1906 
1907   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1908   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1909   BasicType* in_elem_bt = nullptr;
1910 
1911   int argc = 0;
1912   out_sig_bt[argc++] = T_ADDRESS;
1913   if (method->is_static()) {
1914     out_sig_bt[argc++] = T_OBJECT;
1915   }
1916 
1917   for (int i = 0; i < total_in_args ; i++ ) {
1918     out_sig_bt[argc++] = in_sig_bt[i];
1919   }
1920 
1921   // Now figure out where the args must be stored and how much stack space
1922   // they require.
1923   int out_arg_slots;
1924   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1925 
1926   // Compute framesize for the wrapper.  We need to handlize all oops in
1927   // incoming registers
1928 
1929   // Calculate the total number of stack slots we will need.
1930 
1931   // First count the abi requirement plus all of the outgoing args
1932   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1933 
1934   // Now the space for the inbound oop handle area
1935   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1936 
1937   int oop_handle_offset = stack_slots;
1938   stack_slots += total_save_slots;
1939 
1940   // Now any space we need for handlizing a klass if static method
1941 
1942   int klass_slot_offset = 0;
1943   int klass_offset = -1;
1944   int lock_slot_offset = 0;
1945   bool is_static = false;
1946 
1947   if (method->is_static()) {
1948     klass_slot_offset = stack_slots;
1949     stack_slots += VMRegImpl::slots_per_word;
1950     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1951     is_static = true;
1952   }
1953 
1954   // Plus a lock if needed
1955 
1956   if (method->is_synchronized()) {
1957     lock_slot_offset = stack_slots;
1958     stack_slots += VMRegImpl::slots_per_word;
1959   }
1960 
1961   // Now a place (+2) to save return values or temp during shuffling
1962   // + 4 for return address (which we own) and saved rbp
1963   stack_slots += 6;
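       // (6 = 2 slots for the shuffle/result temp + 2 for the return address + 2 for the
       // saved rbp; slots are 4 bytes, words are 8.)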
1964 
1965   // Ok The space we have allocated will look like:
1966   //
1967   //
1968   // FP-> |                     |
1969   //      |---------------------|
1970   //      | 2 slots for moves   |
1971   //      |---------------------|
1972   //      | lock box (if sync)  |
1973   //      |---------------------| <- lock_slot_offset
1974   //      | klass (if static)   |
1975   //      |---------------------| <- klass_slot_offset
1976   //      | oopHandle area      |
1977   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1978   //      | outbound memory     |
1979   //      | based arguments     |
1980   //      |                     |
1981   //      |---------------------|
1982   //      |                     |
1983   // SP-> | out_preserved_slots |
1984   //
1985   //
1986 
1987 
1988   // Now compute the actual number of stack slots we need, rounding to keep the
1989   // stack properly aligned.
1990   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1991 
1992   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1993 
1994   // First thing, make an ic check to see if we should even be here
1995 
1996   // We are free to use all registers as temps without saving them and
1997   // restoring them except rbp. rbp is the only callee save register
1998   // as far as the interpreter and the compiler(s) are concerned.
1999 
2000   const Register receiver = j_rarg0;
2001 
2002   Label exception_pending;
2003 
2004   assert_different_registers(receiver, rscratch1, rscratch2);
2005   __ verify_oop(receiver);
2006   __ ic_check(8 /* end_alignment */);
2007 
2008   int vep_offset = ((intptr_t)__ pc()) - start;
2009 
2010   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2011     Label L_skip_barrier;
2012     Register klass = r10;
2013     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2014     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2015 
2016     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2017 
2018     __ bind(L_skip_barrier);
2019   }
2020 
2021 #ifdef COMPILER1
2022   // For Object.hashCode and System.identityHashCode, try to pull the hashCode from the object header if available.
2023   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2024     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2025   }
2026 #endif // COMPILER1
2027 
2028   // The instruction at the verified entry point must be 5 bytes or longer
2029   // because it can be patched on the fly by make_non_entrant. The stack bang
2030   // instruction fits that requirement.
2031 
2032   // Generate stack overflow check
2033   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2034 
2035   // Generate a new frame for the wrapper.
2036   __ enter();
2037   // -2 because return address is already present and so is saved rbp
2038   __ subptr(rsp, stack_size - 2*wordSize);
2039 
2040   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2041   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2042   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2043 
2044   // Frame is now completed as far as size and linkage.
2045   int frame_complete = ((intptr_t)__ pc()) - start;
2046 
2047 #ifdef ASSERT
2048   __ check_stack_alignment(rsp, "improperly aligned stack");
2049 #endif /* ASSERT */
2050 
2051 
2052   // We use r14 as the oop handle for the receiver/klass
2053   // It is callee save so it survives the call to native
2054 
2055   const Register oop_handle_reg = r14;
2056 
2057   //
2058   // We immediately shuffle the arguments so that for any vm call we have to
2059   // make from here on out (sync slow path, jvmti, etc.) we will have
2060   // captured the oops from our caller and have a valid oopMap for
2061   // them.
2062 
2063   // -----------------
2064   // The Grand Shuffle
2065 
2066   // The Java calling convention is either equal (linux) or denser (win64) than the
2067   // c calling convention. However, because of the jni_env argument, the c calling
2068   // convention always has at least one more argument (and two for static methods) than Java.
2069   // Therefore if we move the args from java -> c backwards then we will never have
2070   // a register->register conflict and we don't have to build a dependency graph
2071   // and figure out how to break any cycles.
2072   //
2073 
2074   // Record esp-based slot for receiver on stack for non-static methods
2075   int receiver_offset = -1;
2076 
2077   // This is a trick. We double the stack slots so we can claim
2078   // the oops in the caller's frame. Since we are sure to have
2079   // more args than the caller doubling is enough to make
2080   // sure we can capture all the incoming oop args from the
2081   // caller.
2082   //
2083   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2084 
2085   // Mark location of rbp (someday)
2086   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2087 
2088   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2089   // All inbound args are referenced based on rbp and all outbound args via rsp.
2090 
2091 
2092 #ifdef ASSERT
2093   bool reg_destroyed[Register::number_of_registers];
2094   bool freg_destroyed[XMMRegister::number_of_registers];
2095   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2096     reg_destroyed[r] = false;
2097   }
2098   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2099     freg_destroyed[f] = false;
2100   }
2101 
2102 #endif /* ASSERT */
2103 
2104   // For JNI natives the incoming and outgoing registers are offset upwards.
2105   GrowableArray<int> arg_order(2 * total_in_args);
2106 
2107   VMRegPair tmp_vmreg;
2108   tmp_vmreg.set2(rbx->as_VMReg());
2109 
2110   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2111     arg_order.push(i);
2112     arg_order.push(c_arg);
2113   }
2114 
2115   int temploc = -1;
2116   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2117     int i = arg_order.at(ai);
2118     int c_arg = arg_order.at(ai + 1);
2119     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2120 #ifdef ASSERT
2121     if (in_regs[i].first()->is_Register()) {
2122       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2123     } else if (in_regs[i].first()->is_XMMRegister()) {
2124       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2125     }
2126     if (out_regs[c_arg].first()->is_Register()) {
2127       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2128     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2129       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2130     }
2131 #endif /* ASSERT */
2132     switch (in_sig_bt[i]) {
2133       case T_ARRAY:
2134       case T_OBJECT:
2135         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2136                     ((i == 0) && (!is_static)),
2137                     &receiver_offset);
2138         break;
2139       case T_VOID:
2140         break;
2141 
2142       case T_FLOAT:
2143         __ float_move(in_regs[i], out_regs[c_arg]);
2144         break;
2145 
2146       case T_DOUBLE:
2147         assert( i + 1 < total_in_args &&
2148                 in_sig_bt[i + 1] == T_VOID &&
2149                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2150         __ double_move(in_regs[i], out_regs[c_arg]);
2151         break;
2152 
2153       case T_LONG :
2154         __ long_move(in_regs[i], out_regs[c_arg]);
2155         break;
2156 
2157       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2158 
2159       default:
2160         __ move32_64(in_regs[i], out_regs[c_arg]);
2161     }
2162   }
2163 
2164   int c_arg;
2165 
2166   // Pre-load a static method's oop into r14.  Used both by locking code and
2167   // the normal JNI call code.
2168   // point c_arg at the first arg that is already loaded in case we
2169   // need to spill before we call out
2170   c_arg = total_c_args - total_in_args;
2171 
2172   if (method->is_static()) {
2173 
2174     //  load oop into a register
2175     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2176 
2177     // Now handlize the static class mirror; it's known not-null.
2178     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2179     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2180 
2181     // Now get the handle
2182     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2183     // store the klass handle as second argument
2184     __ movptr(c_rarg1, oop_handle_reg);
2185     // and protect the arg if we must spill
2186     c_arg--;
2187   }
2188 
2189   // Change state to native (we save the return address in the thread, since it might not
2190   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2191   // points into the right code segment. It does not have to be the correct return pc.
2192   // We use the same pc/oopMap repeatedly when we call out
2193 
2194   intptr_t the_pc = (intptr_t) __ pc();
2195   oop_maps->add_gc_map(the_pc - start, map);
2196 
2197   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2198 
2199 
2200   // We have all of the arguments set up at this point. We must not touch any register
2201   // argument registers at this point (if we had to save/restore them, there would be no oopMap covering them).
2202 
2203   if (DTraceMethodProbes) {
2204     // protect the args we've loaded
2205     save_args(masm, total_c_args, c_arg, out_regs);
2206     __ mov_metadata(c_rarg1, method());
2207     __ call_VM_leaf(
2208       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2209       r15_thread, c_rarg1);
2210     restore_args(masm, total_c_args, c_arg, out_regs);
2211   }
2212 
2213   // RedefineClasses() tracing support for obsolete method entry
2214   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2215     // protect the args we've loaded
2216     save_args(masm, total_c_args, c_arg, out_regs);
2217     __ mov_metadata(c_rarg1, method());
2218     __ call_VM_leaf(
2219       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2220       r15_thread, c_rarg1);
2221     restore_args(masm, total_c_args, c_arg, out_regs);
2222   }
2223 
2224   // Lock a synchronized method
2225 
2226   // Register definitions used by locking and unlocking
2227 
2228   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2229   const Register obj_reg  = rbx;  // Will contain the oop
2230   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2231   const Register old_hdr  = r13;  // value of old header at unlock time
2232 
2233   Label slow_path_lock;
2234   Label lock_done;
2235 
2236   if (method->is_synchronized()) {
2237     Label count_mon;
2238 
2239     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2240 
2241     // Get the handle (the 2nd argument)
2242     __ mov(oop_handle_reg, c_rarg1);
2243 
2244     // Get address of the box
2245 
2246     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2247 
2248     // Load the oop from the handle
2249     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2250 
2251     if (LockingMode == LM_MONITOR) {
2252       __ jmp(slow_path_lock);
2253     } else if (LockingMode == LM_LEGACY) {
2254       // Load immediate 1 into swap_reg %rax
2255       __ movl(swap_reg, 1);
2256 
2257       // Load (object->mark() | 1) into swap_reg %rax
2258       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2259 
2260       // Save (object->mark() | 1) into BasicLock's displaced header
2261       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2262 
2263       // src -> dest iff dest == rax else rax <- dest
2264       __ lock();
2265       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2266       __ jcc(Assembler::equal, count_mon);
2267 
2268       // Hmm should this move to the slow path code area???
2269 
2270       // Test if the oopMark is an obvious stack pointer, i.e.,
2271       //  1) (mark & 3) == 0, and
2272       //  2) rsp <= mark < rsp + os::vm_page_size()
2273       // These 3 tests can be done by evaluating the following
2274       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2275       // assuming both stack pointer and pagesize have their
2276       // least significant 2 bits clear.
2277       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
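           // For example, with a 4K page, 3 - os::vm_page_size() == -4093 == 0x...f003, so the
           // AND is zero exactly when the low two bits of (mark - rsp) are clear and
           // 0 <= mark - rsp < 4096.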
2278 
2279       __ subptr(swap_reg, rsp);
2280       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2281 
2282       // Save the test result, for recursive case, the result is zero
2283       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2284       __ jcc(Assembler::notEqual, slow_path_lock);
2285     } else {
2286       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2287       __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2288     }
2289     __ bind(count_mon);
2290     __ inc_held_monitor_count();
2291 
2292     // Slow path will re-enter here
2293     __ bind(lock_done);
2294   }
2295 
2296   // Finally just about ready to make the JNI call
2297 
2298   // get JNIEnv* which is first argument to native
2299   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2300 
2301   // Now set thread in native
2302   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2303 
2304   __ call(RuntimeAddress(native_func));
2305 
2306   // Verify or restore cpu control state after JNI call
2307   __ restore_cpu_control_state_after_jni(rscratch1);
2308 
2309   // Unpack native results.
2310   switch (ret_type) {
2311   case T_BOOLEAN: __ c2bool(rax);            break;
2312   case T_CHAR   : __ movzwl(rax, rax);      break;
2313   case T_BYTE   : __ sign_extend_byte (rax); break;
2314   case T_SHORT  : __ sign_extend_short(rax); break;
2315   case T_INT    : /* nothing to do */        break;
2316   case T_DOUBLE :
2317   case T_FLOAT  :
2318     // Result is in xmm0; we'll save as needed
2319     break;
2320   case T_ARRAY:                 // Really a handle
2321   case T_OBJECT:                // Really a handle
2322       break; // can't de-handlize until after safepoint check
2323   case T_VOID: break;
2324   case T_LONG: break;
2325   default       : ShouldNotReachHere();
2326   }
2327 
2328   Label after_transition;
2329 
2330   // Switch thread to "native transition" state before reading the synchronization state.
2331   // This additional state is necessary because reading and testing the synchronization
2332   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2333   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2334   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2335   //     Thread A is resumed to finish this native method, but doesn't block here since it
2336   //     didn't see any synchronization in progress, and escapes.
2337   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2338 
2339   // Force this write out before the read below
2340   if (!UseSystemMemoryBarrier) {
2341     __ membar(Assembler::Membar_mask_bits(
2342               Assembler::LoadLoad | Assembler::LoadStore |
2343               Assembler::StoreLoad | Assembler::StoreStore));
2344   }
2345 
2346   // check for safepoint operation in progress and/or pending suspend requests
2347   {
2348     Label Continue;
2349     Label slow_path;
2350 
2351     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2352 
2353     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2354     __ jcc(Assembler::equal, Continue);
2355     __ bind(slow_path);
2356 
2357     // Don't use call_VM as it will see a possible pending exception and forward it
2358     // and never return here preventing us from clearing _last_native_pc down below.
2359     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2360     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2361     // by hand.
2362     //
2363     __ vzeroupper();
2364     save_native_result(masm, ret_type, stack_slots);
2365     __ mov(c_rarg0, r15_thread);
2366     __ mov(r12, rsp); // remember sp
2367     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2368     __ andptr(rsp, -16); // align stack as required by ABI
2369     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2370     __ mov(rsp, r12); // restore sp
2371     __ reinit_heapbase();
2372     // Restore any method result value
2373     restore_native_result(masm, ret_type, stack_slots);
2374     __ bind(Continue);
2375   }
2376 
2377   // change thread state
2378   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2379   __ bind(after_transition);
2380 
2381   Label reguard;
2382   Label reguard_done;
2383   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2384   __ jcc(Assembler::equal, reguard);
2385   __ bind(reguard_done);
2386 
2387   // native result if any is live
2388 
2389   // Unlock
2390   Label slow_path_unlock;
2391   Label unlock_done;
2392   if (method->is_synchronized()) {
2393 
2394     Label fast_done;
2395 
2396     // Get locked oop from the handle we passed to jni
2397     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2398 
2399     if (LockingMode == LM_LEGACY) {
2400       Label not_recur;
2401       // Simple recursive lock?
2402       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2403       __ jcc(Assembler::notEqual, not_recur);
2404       __ dec_held_monitor_count();
2405       __ jmpb(fast_done);
2406       __ bind(not_recur);
2407     }
2408 
2409     // Must save rax if it is live now because cmpxchg must use it
2410     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2411       save_native_result(masm, ret_type, stack_slots);
2412     }
2413 
2414     if (LockingMode == LM_MONITOR) {
2415       __ jmp(slow_path_unlock);
2416     } else if (LockingMode == LM_LEGACY) {
2417       // get address of the stack lock
2418       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2419       //  get old displaced header
2420       __ movptr(old_hdr, Address(rax, 0));
2421 
2422       // Atomic swap old header if oop still contains the stack lock
2423       __ lock();
2424       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2425       __ jcc(Assembler::notEqual, slow_path_unlock);
2426       __ dec_held_monitor_count();
2427     } else {
2428       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2429       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2430       __ dec_held_monitor_count();
2431     }
2432 
2433     // slow path re-enters here
2434     __ bind(unlock_done);
2435     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2436       restore_native_result(masm, ret_type, stack_slots);
2437     }
2438 
2439     __ bind(fast_done);
2440   }
2441   if (DTraceMethodProbes) {
2442     save_native_result(masm, ret_type, stack_slots);
2443     __ mov_metadata(c_rarg1, method());
2444     __ call_VM_leaf(
2445          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2446          r15_thread, c_rarg1);
2447     restore_native_result(masm, ret_type, stack_slots);
2448   }
2449 
2450   __ reset_last_Java_frame(false);
2451 
2452   // Unbox oop result, e.g. JNIHandles::resolve value.
2453   if (is_reference_type(ret_type)) {
2454     __ resolve_jobject(rax /* value */,
2455                        r15_thread /* thread */,
2456                        rcx /* tmp */);
2457   }
2458 
2459   if (CheckJNICalls) {
2460     // clear_pending_jni_exception_check
2461     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2462   }
2463 
2464   // reset handle block
2465   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2466   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2467 
2468   // pop our frame
2469 
2470   __ leave();
2471 
2472   // Any exception pending?
2473   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2474   __ jcc(Assembler::notEqual, exception_pending);
2475 
2476   // Return
2477 
2478   __ ret(0);
2479 
2480   // Unexpected paths are out of line and go here
2481 
2482   // forward the exception
2483   __ bind(exception_pending);
2484 
2485   // and forward the exception
2486   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2487 
2488   // Slow path locking & unlocking
2489   if (method->is_synchronized()) {
2490 
2491     // BEGIN Slow path lock
2492     __ bind(slow_path_lock);
2493 
2494     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2495     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2496 
2497     // protect the args we've loaded
2498     save_args(masm, total_c_args, c_arg, out_regs);
2499 
2500     __ mov(c_rarg0, obj_reg);
2501     __ mov(c_rarg1, lock_reg);
2502     __ mov(c_rarg2, r15_thread);
2503 
2504     // Not a leaf but we have last_Java_frame setup as we want
2505     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2506     restore_args(masm, total_c_args, c_arg, out_regs);
2507 
2508 #ifdef ASSERT
2509     { Label L;
2510     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2511     __ jcc(Assembler::equal, L);
2512     __ stop("no pending exception allowed on exit from monitorenter");
2513     __ bind(L);
2514     }
2515 #endif
2516     __ jmp(lock_done);
2517 
2518     // END Slow path lock
2519 
2520     // BEGIN Slow path unlock
2521     __ bind(slow_path_unlock);
2522 
2523     // If we haven't already saved the native result we must save it now as xmm registers
2524     // are still exposed.
2525     __ vzeroupper();
2526     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2527       save_native_result(masm, ret_type, stack_slots);
2528     }
2529 
2530     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2531 
2532     __ mov(c_rarg0, obj_reg);
2533     __ mov(c_rarg2, r15_thread);
2534     __ mov(r12, rsp); // remember sp
2535     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2536     __ andptr(rsp, -16); // align stack as required by ABI
2537 
2538     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2539     // NOTE that obj_reg == rbx currently
2540     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2541     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2542 
2543     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2544     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2545     __ mov(rsp, r12); // restore sp
2546     __ reinit_heapbase();
2547 #ifdef ASSERT
2548     {
2549       Label L;
2550       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2551       __ jcc(Assembler::equal, L);
2552       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2553       __ bind(L);
2554     }
2555 #endif /* ASSERT */
2556 
2557     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2558 
2559     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2560       restore_native_result(masm, ret_type, stack_slots);
2561     }
2562     __ jmp(unlock_done);
2563 
2564     // END Slow path unlock
2565 
2566   } // synchronized
2567 
2568   // SLOW PATH Reguard the stack if needed
2569 
2570   __ bind(reguard);
2571   __ vzeroupper();
2572   save_native_result(masm, ret_type, stack_slots);
2573   __ mov(r12, rsp); // remember sp
2574   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2575   __ andptr(rsp, -16); // align stack as required by ABI
2576   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2577   __ mov(rsp, r12); // restore sp
2578   __ reinit_heapbase();
2579   restore_native_result(masm, ret_type, stack_slots);
2580   // and continue
2581   __ jmp(reguard_done);
2582 
2583 
2584 
2585   __ flush();
2586 
2587   nmethod *nm = nmethod::new_native_nmethod(method,
2588                                             compile_id,
2589                                             masm->code(),
2590                                             vep_offset,
2591                                             frame_complete,
2592                                             stack_slots / VMRegImpl::slots_per_word,
2593                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2594                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2595                                             oop_maps);
2596 
2597   return nm;
2598 }
2599 
2600 // This function returns the adjusted size (in number of words) of a c2i adapter
2601 // activation, for use during deoptimization.
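     // E.g. a callee with 2 parameters and 5 locals grows the activation by
     // (5 - 2) * Interpreter::stackElementWords words.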
2602 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2603   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2604 }
2605 
2606 
2607 uint SharedRuntime::out_preserve_stack_slots() {
2608   return 0;
2609 }
2610 
2611 
2612 // Number of stack slots between incoming argument block and the start of
2613 // a new frame.  The PROLOG must add this many slots to the stack.  The
2614 // EPILOG must remove this many slots.  amd64 needs two slots for the
2615 // return address and two for the saved rbp.
2616 uint SharedRuntime::in_preserve_stack_slots() {
2617   return 4 + 2 * VerifyStackAtCalls;
2618 }
2619 
2620 //------------------------------generate_deopt_blob----------------------------
2621 void SharedRuntime::generate_deopt_blob() {
2622   // Allocate space for the code
2623   ResourceMark rm;
2624   // Setup code generation tools
2625   int pad = 0;
2626   if (UseAVX > 2) {
2627     pad += 1024;
2628   }
2629   if (UseAPX) {
2630     pad += 1024;
2631   }
2632 #if INCLUDE_JVMCI
2633   if (EnableJVMCI) {
2634     pad += 512; // Increase the buffer size when compiling for JVMCI
2635   }
2636 #endif
2637   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2638   MacroAssembler* masm = new MacroAssembler(&buffer);
2639   int frame_size_in_words;
2640   OopMap* map = nullptr;
2641   OopMapSet *oop_maps = new OopMapSet();
2642 
2643   // -------------
2644   // This code enters when returning to a de-optimized nmethod.  A return
2645   // address has been pushed on the stack, and return values are in
2646   // registers.
2647   // If we are doing a normal deopt then we were called from the patched
2648   // nmethod from the point we returned to the nmethod. So the return
2649   // address on the stack is wrong by NativeCall::instruction_size
2650   // We will adjust the value so it looks like we have the original return
2651   // address on the stack (like when we eagerly deoptimized).
2652   // In the case of an exception pending when deoptimizing, we enter
2653   // with a return address on the stack that points after the call we patched
2654   // into the exception handler. We have the following register state from,
2655   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2656   //    rax: exception oop
2657   //    rbx: exception handler
2658   //    rdx: throwing pc
2659   // So in this case we simply jam rdx into the useless return address and
2660   // the stack looks just like we want.
2661   //
2662   // At this point we need to de-opt.  We save the argument return
2663   // registers.  We call the first C routine, fetch_unroll_info().  This
2664   // routine captures the return values and returns a structure which
2665   // describes the current frame size and the sizes of all replacement frames.
2666   // The current frame is compiled code and may contain many inlined
2667   // functions, each with their own JVM state.  We pop the current frame, then
2668   // push all the new frames.  Then we call the C routine unpack_frames() to
2669   // populate these frames.  Finally unpack_frames() returns us the new target
2670   // address.  Notice that callee-save registers are BLOWN here; they have
2671   // already been captured in the vframeArray at the time the return PC was
2672   // patched.
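  // In outline, the blob below has these entry points (offsets from 'start'):
  //   - start:                    normal deoptimization (Unpack_deopt)
  //   - reexecute_offset:         re-execute the bytecode at the deopt bci (Unpack_reexecute)
  //   - exception_offset:         deopt with exception oop in rax and pc in rdx
  //   - exception_in_tls_offset:  as above, but oop/pc already stored in the JavaThread
  //   - (JVMCI only) uncommon_trap_offset and implicit_exception_uncommon_trap_offset
  // The first four meet at 'cont', call fetch_unroll_info(), pop the deoptimized
  // frame, push skeletal interpreter frames, and finish in unpack_frames(); the
  // JVMCI entries call uncommon_trap() and rejoin at after_fetch_unroll_info_call.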
2673   address start = __ pc();
2674   Label cont;
2675 
2676   // Prolog for non exception case!
2677 
2678   // Save everything in sight.
2679   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2680 
2681   // Normal deoptimization.  Save exec mode for unpack_frames.
2682   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2683   __ jmp(cont);
2684 
2685   int reexecute_offset = __ pc() - start;
2686 #if INCLUDE_JVMCI && !defined(COMPILER1)
2687   if (EnableJVMCI && UseJVMCICompiler) {
2688     // JVMCI does not use this kind of deoptimization
2689     __ should_not_reach_here();
2690   }
2691 #endif
2692 
2693   // Reexecute case
  // The return address is the pc that describes what bci to re-execute at.
2695 
2696   // No need to update map as each call to save_live_registers will produce identical oopmap
2697   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2698 
2699   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2700   __ jmp(cont);
2701 
2702 #if INCLUDE_JVMCI
2703   Label after_fetch_unroll_info_call;
2704   int implicit_exception_uncommon_trap_offset = 0;
2705   int uncommon_trap_offset = 0;
2706 
2707   if (EnableJVMCI) {
2708     implicit_exception_uncommon_trap_offset = __ pc() - start;
2709 
2710     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2711     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2712 
2713     uncommon_trap_offset = __ pc() - start;
2714 
2715     // Save everything in sight.
2716     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2717     // fetch_unroll_info needs to call last_java_frame()
2718     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2719 
2720     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2721     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2722 
2723     __ movl(r14, Deoptimization::Unpack_reexecute);
2724     __ mov(c_rarg0, r15_thread);
2725     __ movl(c_rarg2, r14); // exec mode
2726     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2727     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2728 
2729     __ reset_last_Java_frame(false);
2730 
2731     __ jmp(after_fetch_unroll_info_call);
2732   } // EnableJVMCI
2733 #endif // INCLUDE_JVMCI
2734 
2735   int exception_offset = __ pc() - start;
2736 
2737   // Prolog for exception case
2738 
  // all registers are dead at this entry point, except for rax and
  // rdx, which contain the exception oop and exception pc
2741   // respectively.  Set them in TLS and fall thru to the
2742   // unpack_with_exception_in_tls entry point.
2743 
2744   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2745   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2746 
2747   int exception_in_tls_offset = __ pc() - start;
2748 
2749   // new implementation because exception oop is now passed in JavaThread
2750 
2751   // Prolog for exception case
2752   // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2754   // tos: stack at point of call to method that threw the exception (i.e. only
2755   // args are on the stack, no return address)
2756 
2757   // make room on stack for the return address
2758   // It will be patched later with the throwing pc. The correct value is not
2759   // available now because loading it from memory would destroy registers.
2760   __ push(0);
2761 
2762   // Save everything in sight.
2763   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2764 
2765   // Now it is safe to overwrite any register
2766 
2767   // Deopt during an exception.  Save exec mode for unpack_frames.
2768   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2769 
2770   // load throwing pc from JavaThread and patch it as the return address
2771   // of the current frame. Then clear the field in JavaThread
2772 
2773   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2774   __ movptr(Address(rbp, wordSize), rdx);
2775   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2776 
2777 #ifdef ASSERT
2778   // verify that there is really an exception oop in JavaThread
2779   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2780   __ verify_oop(rax);
2781 
2782   // verify that there is no pending exception
2783   Label no_pending_exception;
2784   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2785   __ testptr(rax, rax);
2786   __ jcc(Assembler::zero, no_pending_exception);
2787   __ stop("must not have pending exception here");
2788   __ bind(no_pending_exception);
2789 #endif
2790 
2791   __ bind(cont);
2792 
2793   // Call C code.  Need thread and this frame, but NOT official VM entry
2794   // crud.  We cannot block on this call, no GC can happen.
2795   //
  // UnrollBlock* fetch_unroll_info(JavaThread* thread, int exec_mode)
2797 
2798   // fetch_unroll_info needs to call last_java_frame().
2799 
2800   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2801 #ifdef ASSERT
2802   { Label L;
2803     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2804     __ jcc(Assembler::equal, L);
2805     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2806     __ bind(L);
2807   }
2808 #endif // ASSERT
2809   __ mov(c_rarg0, r15_thread);
2810   __ movl(c_rarg1, r14); // exec_mode
2811   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2812 
2813   // Need to have an oopmap that tells fetch_unroll_info where to
2814   // find any register it might need.
2815   oop_maps->add_gc_map(__ pc() - start, map);
2816 
2817   __ reset_last_Java_frame(false);
2818 
2819 #if INCLUDE_JVMCI
2820   if (EnableJVMCI) {
2821     __ bind(after_fetch_unroll_info_call);
2822   }
2823 #endif
2824 
2825   // Load UnrollBlock* into rdi
2826   __ mov(rdi, rax);
2827 
2828   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
  Label noException;
2830   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2831   __ jcc(Assembler::notEqual, noException);
2832   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // QQQ this is useless; it was null above
2834   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2835   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2836   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2837 
2838   __ verify_oop(rax);
2839 
2840   // Overwrite the result registers with the exception results.
2841   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2842   // I think this is useless
2843   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2844 
2845   __ bind(noException);
2846 
2847   // Only register save data is on the stack.
2848   // Now restore the result registers.  Everything else is either dead
2849   // or captured in the vframeArray.
2850   RegisterSaver::restore_result_registers(masm);
2851 
  // All of the register save area has been popped off the stack. Only the
2853   // return address remains.
2854 
2855   // Pop all the frames we must move/replace.
2856   //
2857   // Frame picture (youngest to oldest)
2858   // 1: self-frame (no frame link)
2859   // 2: deopting frame  (no frame link)
2860   // 3: caller of deopting frame (could be compiled/interpreted).
2861   //
2862   // Note: by leaving the return address of self-frame on the stack
  // and using the size of frame 2 to adjust the stack,
2864   // when we are done the return to frame 3 will still be on the stack.
2865 
2866   // Pop deoptimized frame
2867   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2868   __ addptr(rsp, rcx);
2869 
2870   // rsp should be pointing at the return address to the caller (3)
2871 
2872   // Pick up the initial fp we should save
2873   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2874   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2875 
2876 #ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
2880   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2881   __ bang_stack_size(rbx, rcx);
2882 #endif
2883 
2884   // Load address of array of frame pcs into rcx
2885   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2886 
2887   // Trash the old pc
2888   __ addptr(rsp, wordSize);
2889 
2890   // Load address of array of frame sizes into rsi
2891   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2892 
2893   // Load counter into rdx
2894   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2895 
2896   // Now adjust the caller's stack to make up for the extra locals
2897   // but record the original sp so that we can save it in the skeletal interpreter
2898   // frame and the stack walking of interpreter_sender will get the unextended sp
2899   // value and not the "real" sp value.
2900 
2901   const Register sender_sp = r8;
2902 
2903   __ mov(sender_sp, rsp);
2904   __ movl(rbx, Address(rdi,
2905                        Deoptimization::UnrollBlock::
2906                        caller_adjustment_offset()));
2907   __ subptr(rsp, rbx);
2908 
2909   // Push interpreter frames in a loop
2910   Label loop;
2911   __ bind(loop);
2912   __ movptr(rbx, Address(rsi, 0));      // Load frame size
  __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
  __ pushptr(Address(rcx, 0));          // Save return address
  __ enter();                           // Save old & set new rbp
2916   __ subptr(rsp, rbx);                  // Prolog
2917   // This value is corrected by layout_activation_impl
2918   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2919   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2920   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2921   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2922   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2923   __ decrementl(rdx);                   // Decrement counter
2924   __ jcc(Assembler::notZero, loop);
2925   __ pushptr(Address(rcx, 0));          // Save final return address
2926 
2927   // Re-push self-frame
  __ enter();                           // Save old & set new rbp
2929 
  // Allocate a full-sized register save area.
  // Return address and rbp are in place, so we allocate two fewer words.
2932   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2933 
2934   // Restore frame locals after moving the frame
2935   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2936   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2937 
2938   // Call C code.  Need thread but NOT official VM entry
2939   // crud.  We cannot block on this call, no GC can happen.  Call should
2940   // restore return values to their stack-slots with the new SP.
2941   //
  // BasicType Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2943 
2944   // Use rbp because the frames look interpreted now
2945   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2946   // Don't need the precise return PC here, just precise enough to point into this code blob.
2947   address the_pc = __ pc();
2948   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2949 
2950   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2951   __ mov(c_rarg0, r15_thread);
2952   __ movl(c_rarg1, r14); // second arg: exec_mode
2953   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2954   // Revert SP alignment after call since we're going to do some SP relative addressing below
2955   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2956 
2957   // Set an oopmap for the call site
2958   // Use the same PC we used for the last java frame
2959   oop_maps->add_gc_map(the_pc - start,
2960                        new OopMap( frame_size_in_words, 0 ));
2961 
2962   // Clear fp AND pc
2963   __ reset_last_Java_frame(true);
2964 
2965   // Collect return values
2966   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2967   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2968   // I think this is useless (throwing pc?)
2969   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2970 
2971   // Pop self-frame.
2972   __ leave();                           // Epilog
2973 
2974   // Jump to interpreter
2975   __ ret(0);
2976 
2977   // Make sure all code is generated
2978   masm->flush();
2979 
2980   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2981   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2982 #if INCLUDE_JVMCI
2983   if (EnableJVMCI) {
2984     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2985     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2986   }
2987 #endif
2988 }
2989 
2990 #ifdef COMPILER2
2991 //------------------------------generate_uncommon_trap_blob--------------------
2992 void SharedRuntime::generate_uncommon_trap_blob() {
2993   // Allocate space for the code
2994   ResourceMark rm;
2995   // Setup code generation tools
2996   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2997   MacroAssembler* masm = new MacroAssembler(&buffer);
2998 
2999   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3000 
3001   address start = __ pc();
3002 
3003   // Push self-frame.  We get here with a return address on the
3004   // stack, so rsp is 8-byte aligned until we allocate our frame.
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3006 
  // No callee-saved registers. rbp is assumed implicitly saved
3008   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3009 
  // The compiler left unloaded_class_index in j_rarg0; move it to where the
  // runtime expects it.
3012   __ movl(c_rarg1, j_rarg0);
3013 
3014   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3015 
3016   // Call C code.  Need thread but NOT official VM entry
3017   // crud.  We cannot block on this call, no GC can happen.  Call should
3018   // capture callee-saved registers as well as return values.
  // The thread is passed in c_rarg0 below.
  //
  // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
3022 
3023   __ mov(c_rarg0, r15_thread);
3024   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
3025   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3026 
3027   // Set an oopmap for the call site
3028   OopMapSet* oop_maps = new OopMapSet();
3029   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
3030 
3031   // location of rbp is known implicitly by the frame sender code
3032 
3033   oop_maps->add_gc_map(__ pc() - start, map);
3034 
3035   __ reset_last_Java_frame(false);
3036 
3037   // Load UnrollBlock* into rdi
3038   __ mov(rdi, rax);
3039 
3040 #ifdef ASSERT
3041   { Label L;
3042     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
3043               Deoptimization::Unpack_uncommon_trap);
3044     __ jcc(Assembler::equal, L);
3045     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
3046     __ bind(L);
3047   }
3048 #endif
3049 
3050   // Pop all the frames we must move/replace.
3051   //
3052   // Frame picture (youngest to oldest)
3053   // 1: self-frame (no frame link)
3054   // 2: deopting frame  (no frame link)
3055   // 3: caller of deopting frame (could be compiled/interpreted).
3056 
3057   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3058   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3059 
3060   // Pop deoptimized frame (int)
3061   __ movl(rcx, Address(rdi,
3062                        Deoptimization::UnrollBlock::
3063                        size_of_deoptimized_frame_offset()));
3064   __ addptr(rsp, rcx);
3065 
3066   // rsp should be pointing at the return address to the caller (3)
3067 
3068   // Pick up the initial fp we should save
3069   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3070   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3071 
3072 #ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3077   __ bang_stack_size(rbx, rcx);
3078 #endif
3079 
3080   // Load address of array of frame pcs into rcx (address*)
3081   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3082 
3083   // Trash the return pc
3084   __ addptr(rsp, wordSize);
3085 
3086   // Load address of array of frame sizes into rsi (intptr_t*)
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3088 
3089   // Counter
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
3091 
3092   // Now adjust the caller's stack to make up for the extra locals but
3093   // record the original sp so that we can save it in the skeletal
3094   // interpreter frame and the stack walking of interpreter_sender
3095   // will get the unextended sp value and not the "real" sp value.
3096 
3097   const Register sender_sp = r8;
3098 
3099   __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
3101   __ subptr(rsp, rbx);
3102 
3103   // Push interpreter frames in a loop
3104   Label loop;
3105   __ bind(loop);
3106   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3107   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3108   __ pushptr(Address(rcx, 0));     // Save return address
3109   __ enter();                      // Save old & set new rbp
3110   __ subptr(rsp, rbx);             // Prolog
3111   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3112             sender_sp);            // Make it walkable
3113   // This value is corrected by layout_activation_impl
3114   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3115   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3116   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3117   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3118   __ decrementl(rdx);              // Decrement counter
3119   __ jcc(Assembler::notZero, loop);
3120   __ pushptr(Address(rcx, 0));     // Save final return address
3121 
3122   // Re-push self-frame
3123   __ enter();                 // Save old & set new rbp
3124   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3125                               // Prolog
3126 
3127   // Use rbp because the frames look interpreted now
3128   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3129   // Don't need the precise return PC here, just precise enough to point into this code blob.
3130   address the_pc = __ pc();
3131   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3132 
3133   // Call C code.  Need thread but NOT official VM entry
3134   // crud.  We cannot block on this call, no GC can happen.  Call should
3135   // restore return values to their stack-slots with the new SP.
  // The thread is passed in c_rarg0 below.
3137   //
3138   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3139 
3140   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3141   __ mov(c_rarg0, r15_thread);
3142   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3143   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3144 
3145   // Set an oopmap for the call site
3146   // Use the same PC we used for the last java frame
3147   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3148 
3149   // Clear fp AND pc
3150   __ reset_last_Java_frame(true);
3151 
3152   // Pop self-frame.
3153   __ leave();                 // Epilog
3154 
3155   // Jump to interpreter
3156   __ ret(0);
3157 
3158   // Make sure all code is generated
3159   masm->flush();
3160 
3161   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3162                                                  SimpleRuntimeFrame::framesize >> 1);
3163 }
3164 #endif // COMPILER2
3165 
3166 //------------------------------generate_handler_blob------
3167 //
// Generate a special Compile2Runtime blob that saves all registers
// and sets up the oopmap.
3170 //
3171 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3172   assert(StubRoutines::forward_exception_entry() != nullptr,
3173          "must be generated before");
3174 
3175   ResourceMark rm;
3176   OopMapSet *oop_maps = new OopMapSet();
3177   OopMap* map;
3178 
3179   // Allocate space for the code.  Setup code generation tools.
3180   CodeBuffer buffer("handler_blob", 2348, 1024);
3181   MacroAssembler* masm = new MacroAssembler(&buffer);
3182 
3183   address start   = __ pc();
3184   address call_pc = nullptr;
3185   int frame_size_in_words;
3186   bool cause_return = (poll_type == POLL_AT_RETURN);
3187   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3188 
3189   // Make room for return address (or push it again)
3190   if (!cause_return) {
3191     __ push(rbx);
3192   }
3193 
3194   // Save registers, fpu state, and flags
3195   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3196 
3197   // The following is basically a call_VM.  However, we need the precise
3198   // address of the call in order to generate an oopmap. Hence, we do all the
3199   // work ourselves.
3200 
3201   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3202 
3203   // The return address must always be correct so that frame constructor never
3204   // sees an invalid pc.
3205 
3206   if (!cause_return) {
3207     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3208     // Additionally, rbx is a callee saved register and we can look at it later to determine
3209     // if someone changed the return address for us!
3210     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3211     __ movptr(Address(rbp, wordSize), rbx);
3212   }
3213 
3214   // Do the call
3215   __ mov(c_rarg0, r15_thread);
3216   __ call(RuntimeAddress(call_ptr));
3217 
3218   // Set an oopmap for the call site.  This oopmap will map all
3219   // oop-registers and debug-info registers as callee-saved.  This
3220   // will allow deoptimization at this safepoint to find all possible
3221   // debug-info recordings, as well as let GC find all oops.
3222 
3223   oop_maps->add_gc_map( __ pc() - start, map);
3224 
3225   Label noException;
3226 
3227   __ reset_last_Java_frame(false);
3228 
3229   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3230   __ jcc(Assembler::equal, noException);
3231 
3232   // Exception pending
3233 
3234   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3235 
3236   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3237 
3238   // No exception case
3239   __ bind(noException);
3240 
3241   Label no_adjust;
3242 #ifdef ASSERT
3243   Label bail;
3244 #endif
3245   if (!cause_return) {
3246     Label no_prefix, not_special;
3247 
3248     // If our stashed return pc was modified by the runtime we avoid touching it
3249     __ cmpptr(rbx, Address(rbp, wordSize));
3250     __ jccb(Assembler::notEqual, no_adjust);
3251 
3252     // Skip over the poll instruction.
3253     // See NativeInstruction::is_safepoint_poll()
3254     // Possible encodings:
3255     //      85 00       test   %eax,(%rax)
3256     //      85 01       test   %eax,(%rcx)
3257     //      85 02       test   %eax,(%rdx)
3258     //      85 03       test   %eax,(%rbx)
3259     //      85 06       test   %eax,(%rsi)
3260     //      85 07       test   %eax,(%rdi)
3261     //
3262     //   41 85 00       test   %eax,(%r8)
3263     //   41 85 01       test   %eax,(%r9)
3264     //   41 85 02       test   %eax,(%r10)
3265     //   41 85 03       test   %eax,(%r11)
3266     //   41 85 06       test   %eax,(%r14)
3267     //   41 85 07       test   %eax,(%r15)
3268     //
3269     //      85 04 24    test   %eax,(%rsp)
3270     //   41 85 04 24    test   %eax,(%r12)
3271     //      85 45 00    test   %eax,0x0(%rbp)
3272     //   41 85 45 00    test   %eax,0x0(%r13)
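    // Based on these encodings, the code below first skips an optional REX
    // prefix byte (0x41), then an optional extra byte that is present when the
    // base register is rsp/r12 (SIB byte) or rbp/r13 (0x0 disp8), and finally
    // adds 2 for the test opcode (0x85) and the ModRM byte, leaving the stashed
    // return pc just past the poll instruction.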
3273 
3274     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3275     __ jcc(Assembler::notEqual, no_prefix);
3276     __ addptr(rbx, 1);
3277     __ bind(no_prefix);
3278 #ifdef ASSERT
3279     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3280 #endif
3281     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3282     // r12/rsp 0x04
3283     // r13/rbp 0x05
3284     __ movzbq(rcx, Address(rbx, 1));
3285     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3286     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3287     __ cmpptr(rcx, 1);
3288     __ jcc(Assembler::above, not_special);
3289     __ addptr(rbx, 1);
3290     __ bind(not_special);
3291 #ifdef ASSERT
3292     // Verify the correct encoding of the poll we're about to skip.
3293     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3294     __ jcc(Assembler::notEqual, bail);
3295     // Mask out the modrm bits
3296     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3297     // rax encodes to 0, so if the bits are nonzero it's incorrect
3298     __ jcc(Assembler::notZero, bail);
3299 #endif
3300     // Adjust return pc forward to step over the safepoint poll instruction
3301     __ addptr(rbx, 2);
3302     __ movptr(Address(rbp, wordSize), rbx);
3303   }
3304 
3305   __ bind(no_adjust);
3306   // Normal exit, restore registers and exit.
3307   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3308   __ ret(0);
3309 
3310 #ifdef ASSERT
3311   __ bind(bail);
3312   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3313 #endif
3314 
3315   // Make sure all code is generated
3316   masm->flush();
3317 
3318   // Fill-out other meta info
3319   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3320 }
3321 
3322 //
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3324 //
// Generate a stub that calls into the VM to find out the proper destination
// of a Java call. All the argument registers are live at this point,
// but since this is generic code we don't know what they are, and the caller
// must do any GC of the args.
3329 //
3330 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3331   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3332 
3333   // allocate space for the code
3334   ResourceMark rm;
3335 
3336   CodeBuffer buffer(name, 1552, 512);
3337   MacroAssembler* masm = new MacroAssembler(&buffer);
3338 
3339   int frame_size_in_words;
3340 
3341   OopMapSet *oop_maps = new OopMapSet();
3342   OopMap* map = nullptr;
3343 
3344   int start = __ offset();
3345 
3346   // No need to save vector registers since they are caller-saved anyway.
3347   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3348 
3349   int frame_complete = __ offset();
3350 
3351   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3352 
3353   __ mov(c_rarg0, r15_thread);
3354 
3355   __ call(RuntimeAddress(destination));
3356 
3357 
3358   // Set an oopmap for the call site.
3359   // We need this not only for callee-saved registers, but also for volatile
3360   // registers that the compiler might be keeping live across a safepoint.
3361 
3362   oop_maps->add_gc_map( __ offset() - start, map);
3363 
3364   // rax contains the address we are going to jump to assuming no exception got installed
3365 
3366   // clear last_Java_sp
3367   __ reset_last_Java_frame(false);
3368   // check for pending exceptions
3369   Label pending;
3370   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3371   __ jcc(Assembler::notEqual, pending);
3372 
3373   // get the returned Method*
3374   __ get_vm_result_2(rbx, r15_thread);
3375   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3376 
3377   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3378 
3379   RegisterSaver::restore_live_registers(masm);
3380 
3381   // We are back to the original state on entry and ready to go.
3382 
3383   __ jmp(rax);
3384 
3385   // Pending exception after the safepoint
3386 
3387   __ bind(pending);
3388 
3389   RegisterSaver::restore_live_registers(masm);
3390 
3391   // exception pending => remove activation and forward to exception handler
3392 
3393   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3394 
3395   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3396   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3397 
3398   // -------------
3399   // make sure all code is generated
3400   masm->flush();
3401 
  // return the blob (frame size is in words)
3404   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3405 }
3406 
3407 //------------------------------Montgomery multiplication------------------------
3408 //
3409 
3410 #ifndef _WINDOWS
3411 
3412 // Subtract 0:b from carry:a.  Return carry.
3413 static julong
3414 sub(julong a[], julong b[], julong carry, long len) {
3415   long long i = 0, cnt = len;
3416   julong tmp;
3417   asm volatile("clc; "
3418                "0: ; "
3419                "mov (%[b], %[i], 8), %[tmp]; "
3420                "sbb %[tmp], (%[a], %[i], 8); "
3421                "inc %[i]; dec %[cnt]; "
3422                "jne 0b; "
3423                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3424                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3425                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3426                : "memory");
3427   return tmp;
3428 }
3429 
3430 // Multiply (unsigned) Long A by Long B, accumulating the double-
3431 // length result into the accumulator formed of T0, T1, and T2.
3432 #define MACC(A, B, T0, T1, T2)                                  \
3433 do {                                                            \
3434   unsigned long hi, lo;                                         \
3435   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3436            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3437            : "r"(A), "a"(B) : "cc");                            \
3438  } while(0)
3439 
3440 // As above, but add twice the double-length result into the
3441 // accumulator.
3442 #define MACC2(A, B, T0, T1, T2)                                 \
3443 do {                                                            \
3444   unsigned long hi, lo;                                         \
3445   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3446            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3447            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3448            : "r"(A), "a"(B) : "cc");                            \
3449  } while(0)
3450 
3451 #else //_WINDOWS
3452 
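// Subtract 0:b from carry:a.  Return carry.
// (Subtraction is expressed via _addcarry_u64 using the two's-complement
// identity a - b == a + ~b + 1: the loop adds ~b[i] with an initial carry-in of 1.)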
3453 static julong
3454 sub(julong a[], julong b[], julong carry, long len) {
3455   long i;
3456   julong tmp;
3457   unsigned char c = 1;
3458   for (i = 0; i < len; i++) {
3459     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3460     a[i] = tmp;
3461   }
3462   c = _addcarry_u64(c, carry, ~0, &tmp);
3463   return tmp;
3464 }
3465 
3466 // Multiply (unsigned) Long A by Long B, accumulating the double-
3467 // length result into the accumulator formed of T0, T1, and T2.
3468 #define MACC(A, B, T0, T1, T2)                          \
3469 do {                                                    \
3470   julong hi, lo;                            \
3471   lo = _umul128(A, B, &hi);                             \
3472   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3473   c = _addcarry_u64(c, hi, T1, &T1);                    \
3474   _addcarry_u64(c, T2, 0, &T2);                         \
3475  } while(0)
3476 
3477 // As above, but add twice the double-length result into the
3478 // accumulator.
3479 #define MACC2(A, B, T0, T1, T2)                         \
3480 do {                                                    \
3481   julong hi, lo;                            \
3482   lo = _umul128(A, B, &hi);                             \
3483   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3484   c = _addcarry_u64(c, hi, T1, &T1);                    \
3485   _addcarry_u64(c, T2, 0, &T2);                         \
3486   c = _addcarry_u64(0, lo, T0, &T0);                    \
3487   c = _addcarry_u64(c, hi, T1, &T1);                    \
3488   _addcarry_u64(c, T2, 0, &T2);                         \
3489  } while(0)
3490 
3491 #endif //_WINDOWS
3492 
3493 // Fast Montgomery multiplication.  The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
3495 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
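//
// Roughly speaking, montgomery_multiply() below computes
//     m = a * b * R^-1 (mod n),  where R = 2^(64*len),
// word by word, with inv == -n^-1 (mod 2^64); hence the assert that
// inv * n[0] == ULLONG_MAX, i.e. -1 mod 2^64.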
3496 
3497 static void NOINLINE
3498 montgomery_multiply(julong a[], julong b[], julong n[],
3499                     julong m[], julong inv, int len) {
3500   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3501   int i;
3502 
3503   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3504 
3505   for (i = 0; i < len; i++) {
3506     int j;
3507     for (j = 0; j < i; j++) {
3508       MACC(a[j], b[i-j], t0, t1, t2);
3509       MACC(m[j], n[i-j], t0, t1, t2);
3510     }
3511     MACC(a[i], b[0], t0, t1, t2);
3512     m[i] = t0 * inv;
3513     MACC(m[i], n[0], t0, t1, t2);
3514 
3515     assert(t0 == 0, "broken Montgomery multiply");
3516 
3517     t0 = t1; t1 = t2; t2 = 0;
3518   }
3519 
3520   for (i = len; i < 2*len; i++) {
3521     int j;
3522     for (j = i-len+1; j < len; j++) {
3523       MACC(a[j], b[i-j], t0, t1, t2);
3524       MACC(m[j], n[i-j], t0, t1, t2);
3525     }
3526     m[i-len] = t0;
3527     t0 = t1; t1 = t2; t2 = 0;
3528   }
3529 
3530   while (t0)
3531     t0 = sub(m, n, t0, len);
3532 }
3533 
3534 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3535 // multiplies so it should be up to 25% faster than Montgomery
3536 // multiplication.  However, its loop control is more complex and it
3537 // may actually run slower on some machines.
3538 
3539 static void NOINLINE
3540 montgomery_square(julong a[], julong n[],
3541                   julong m[], julong inv, int len) {
3542   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3543   int i;
3544 
3545   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3546 
3547   for (i = 0; i < len; i++) {
3548     int j;
3549     int end = (i+1)/2;
3550     for (j = 0; j < end; j++) {
3551       MACC2(a[j], a[i-j], t0, t1, t2);
3552       MACC(m[j], n[i-j], t0, t1, t2);
3553     }
3554     if ((i & 1) == 0) {
3555       MACC(a[j], a[j], t0, t1, t2);
3556     }
3557     for (; j < i; j++) {
3558       MACC(m[j], n[i-j], t0, t1, t2);
3559     }
3560     m[i] = t0 * inv;
3561     MACC(m[i], n[0], t0, t1, t2);
3562 
3563     assert(t0 == 0, "broken Montgomery square");
3564 
3565     t0 = t1; t1 = t2; t2 = 0;
3566   }
3567 
3568   for (i = len; i < 2*len; i++) {
3569     int start = i-len+1;
3570     int end = start + (len - start)/2;
3571     int j;
3572     for (j = start; j < end; j++) {
3573       MACC2(a[j], a[i-j], t0, t1, t2);
3574       MACC(m[j], n[i-j], t0, t1, t2);
3575     }
3576     if ((i & 1) == 0) {
3577       MACC(a[j], a[j], t0, t1, t2);
3578     }
3579     for (; j < len; j++) {
3580       MACC(m[j], n[i-j], t0, t1, t2);
3581     }
3582     m[i-len] = t0;
3583     t0 = t1; t1 = t2; t2 = 0;
3584   }
3585 
3586   while (t0)
3587     t0 = sub(m, n, t0, len);
3588 }
3589 
3590 // Swap words in a longword.
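// For example, swap(0x0000000100000002ULL) == 0x0000000200000001ULL.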
3591 static julong swap(julong x) {
3592   return (x << 32) | (x >> 32);
3593 }
3594 
3595 // Copy len longwords from s to d, word-swapping as we go.  The
3596 // destination array is reversed.
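// (Presumably the incoming jint arrays are most-significant word first, while
// the routines above operate on little-endian julong arrays, hence both the
// per-word swap and the reversal.)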
3597 static void reverse_words(julong *s, julong *d, int len) {
3598   d += len;
3599   while(len-- > 0) {
3600     d--;
3601     *d = swap(*s);
3602     s++;
3603   }
3604 }
3605 
3606 // The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3608 #define MONTGOMERY_SQUARING_THRESHOLD 64
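// Note: the threshold is compared against the jint count (len) in
// montgomery_square() below, so 64 jints correspond to a 2048-bit operand.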
3609 
3610 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3611                                         jint len, jlong inv,
3612                                         jint *m_ints) {
3613   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3614   int longwords = len/2;
3615 
3616   // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints correspond to a 16384-bit integer and
  // will use a total of 8K bytes of stack space here.
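  // (That is: 512 jints == 256 julong longwords, and the four scratch arrays
  // below need 4 * 256 * sizeof(julong) == 8192 bytes, which is exactly the
  // bound the guarantee below enforces.)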
3619   int divisor = sizeof(julong) * 4;
3620   guarantee(longwords <= 8192 / divisor, "must be");
3621   int total_allocation = longwords * sizeof (julong) * 4;
3622   julong *scratch = (julong *)alloca(total_allocation);
3623 
3624   // Local scratch arrays
3625   julong
3626     *a = scratch + 0 * longwords,
3627     *b = scratch + 1 * longwords,
3628     *n = scratch + 2 * longwords,
3629     *m = scratch + 3 * longwords;
3630 
3631   reverse_words((julong *)a_ints, a, longwords);
3632   reverse_words((julong *)b_ints, b, longwords);
3633   reverse_words((julong *)n_ints, n, longwords);
3634 
3635   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3636 
3637   reverse_words(m, (julong *)m_ints, longwords);
3638 }
3639 
3640 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3641                                       jint len, jlong inv,
3642                                       jint *m_ints) {
3643   assert(len % 2 == 0, "array length in montgomery_square must be even");
3644   int longwords = len/2;
3645 
3646   // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints correspond to a 16384-bit integer and
  // will use a total of 6K bytes of stack space here.
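  // (That is: 512 jints == 256 julong longwords, and the three scratch arrays
  // below need 3 * 256 * sizeof(julong) == 6144 bytes; the guarantee below
  // caps the total allocation at 8K bytes.)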
3649   int divisor = sizeof(julong) * 3;
3650   guarantee(longwords <= (8192 / divisor), "must be");
3651   int total_allocation = longwords * sizeof (julong) * 3;
3652   julong *scratch = (julong *)alloca(total_allocation);
3653 
3654   // Local scratch arrays
3655   julong
3656     *a = scratch + 0 * longwords,
3657     *n = scratch + 1 * longwords,
3658     *m = scratch + 2 * longwords;
3659 
3660   reverse_words((julong *)a_ints, a, longwords);
3661   reverse_words((julong *)n_ints, n, longwords);
3662 
3663   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3664     ::montgomery_square(a, n, m, (julong)inv, longwords);
3665   } else {
3666     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3667   }
3668 
3669   reverse_words(m, (julong *)m_ints, longwords);
3670 }
3671 
3672 #ifdef COMPILER2
3673 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3674 //
3675 //------------------------------generate_exception_blob---------------------------
// Creates the exception blob at the end.
// Using the exception blob, this code is jumped to from a compiled method.
3678 // (see emit_exception_handler in x86_64.ad file)
3679 //
3680 // Given an exception pc at a call we call into the runtime for the
3681 // handler in this method. This handler might merely restore state
// (i.e. callee-saved registers), unwind the frame, and jump to the
// exception handler for the nmethod if there is no Java-level handler
3684 // for the nmethod.
3685 //
3686 // This code is entered with a jmp.
3687 //
3688 // Arguments:
3689 //   rax: exception oop
3690 //   rdx: exception pc
3691 //
3692 // Results:
3693 //   rax: exception oop
3694 //   rdx: exception pc in caller or ???
3695 //   destination: exception handler of caller
3696 //
3697 // Note: the exception pc MUST be at a call (precise debug information)
3698 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3699 //
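// In outline, the generated code:
//   1. pushes the exception pc as the "return address" and allocates a
//      SimpleRuntimeFrame, saving rbp into it;
//   2. stores the exception oop and pc into the JavaThread;
//   3. calls OptoRuntime::handle_exception_C(thread), which returns the
//      handler address in rax;
//   4. unwinds the frame, reloads the exception oop/pc from the JavaThread
//      (clearing the oop so GC no longer sees it as a root), and jumps to the
//      handler with rax = exception oop and rdx = exception pc.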
3700 
3701 void OptoRuntime::generate_exception_blob() {
3702   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3703   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3704   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3705 
3706   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3707 
3708   // Allocate space for the code
3709   ResourceMark rm;
3710   // Setup code generation tools
3711   CodeBuffer buffer("exception_blob", 2048, 1024);
3712   MacroAssembler* masm = new MacroAssembler(&buffer);
3713 
3714 
3715   address start = __ pc();
3716 
3717   // Exception pc is 'return address' for stack walker
3718   __ push(rdx);
3719   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3720 
3721   // Save callee-saved registers.  See x86_64.ad.
3722 
  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-saved registers now that adapter frames are gone.
3726 
3727   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3728 
3729   // Store exception in Thread object. We cannot pass any arguments to the
3730   // handle_exception call, since we do not want to make any assumption
  // about the size of the frame in which the exception happened.
3732   // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3734   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3735 
3736   // This call does all the hard work.  It checks if an exception handler
3737   // exists in the method.
3738   // If so, it returns the handler address.
3739   // If not, it prepares for stack-unwinding, restoring the callee-save
3740   // registers of the frame being removed.
3741   //
3742   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3743 
3744   // At a method handle call, the stack may not be properly aligned
3745   // when returning with an exception.
3746   address the_pc = __ pc();
3747   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3748   __ mov(c_rarg0, r15_thread);
3749   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3750   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3751 
3752   // Set an oopmap for the call site.  This oopmap will only be used if we
3753   // are unwinding the stack.  Hence, all locations will be dead.
3754   // Callee-saved registers will be the same as the frame above (i.e.,
3755   // handle_exception_stub), since they were restored when we got the
3756   // exception.
3757 
3758   OopMapSet* oop_maps = new OopMapSet();
3759 
3760   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3761 
3762   __ reset_last_Java_frame(false);
3763 
3764   // Restore callee-saved registers
3765 
3766   // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-saved registers now that adapter frames are gone.
3769 
3770   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3771 
3772   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3773   __ pop(rdx);                  // No need for exception pc anymore
3774 
3775   // rax: exception handler
3776 
3777   // We have a handler in rax (could be deopt blob).
3778   __ mov(r8, rax);
3779 
3780   // Get the exception oop
3781   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3782   // Get the exception pc in case we are deoptimized
3783   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3784 #ifdef ASSERT
3785   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3786   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3787 #endif
3788   // Clear the exception oop so GC no longer processes it as a root.
3789   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3790 
3791   // rax: exception oop
3792   // r8:  exception handler
3793   // rdx: exception pc
3794   // Jump to handler
3795 
3796   __ jmp(r8);
3797 
3798   // Make sure all code is generated
3799   masm->flush();
3800 
3801   // Set exception blob
3802   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3803 }
3804 #endif // COMPILER2