1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/vframeArray.hpp"
  56 #include "runtime/vm_version.hpp"
  57 #include "utilities/align.hpp"
  58 #include "utilities/checkedCast.hpp"
  59 #include "utilities/formatBuffer.hpp"
  60 #include "vmreg_x86.inline.hpp"
  61 #ifdef COMPILER1
  62 #include "c1/c1_Runtime1.hpp"
  63 #endif
  64 #ifdef COMPILER2
  65 #include "opto/runtime.hpp"
  66 #endif
  67 #if INCLUDE_JVMCI
  68 #include "jvmci/jvmciJavaClasses.hpp"
  69 #endif
  70 
  71 #define __ masm->
  72 
  73 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  74 
  75 class SimpleRuntimeFrame {
  76 
  77   public:
  78 
  79   // Most of the runtime stubs have this simple frame layout.
  80   // This class exists to make the layout shared in one place.
  81   // Offsets are for compiler stack slots, which are jints.
  82   enum layout {
  83     // The frame sender code expects that rbp will be in the "natural" place and
  84     // will override any oopMap setting for it. We must therefore force the layout
  85     // so that it agrees with the frame sender code.
  86     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  87     rbp_off2,
  88     return_off, return_off2,
  89     framesize
  90   };
  91 };
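// Illustrative sketch (not part of the class above): on targets where
// frame::arg_reg_save_area_bytes is 0, the enum evaluates to
//   rbp_off = 0, rbp_off2 = 1, return_off = 2, return_off2 = 3, framesize = 4
// i.e. such a stub frame is 4 compiler slots (2 words): the saved rbp plus the
// return address.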
  92 
  93 class RegisterSaver {
  94   // Capture info about frame layout.  Layout offsets are in jint
  95   // units because compiler frame slots are jints.
  96 #define XSAVE_AREA_BEGIN 160
  97 #define XSAVE_AREA_YMM_BEGIN 576
  98 #define XSAVE_AREA_EGPRS 960
  99 #define XSAVE_AREA_OPMASK_BEGIN 1088
 100 #define XSAVE_AREA_ZMM_BEGIN 1152
 101 #define XSAVE_AREA_UPPERBANK 1664
 102 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 103 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 104 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 105 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 106 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
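// For example, DEF_XMM_OFFS(0) expands to
//   xmm0_off = xmm_off + 0*16/BytesPerInt, xmm0H_off
// so consecutive xmm..._off entries are 4 compiler slots (16 bytes) apart and
// each ...H_off simply names the slot following the corresponding ..._off.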
 107   enum layout {
 108     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
 109     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
 110     DEF_XMM_OFFS(0),
 111     DEF_XMM_OFFS(1),
 112     // 2..15 are implied in range usage
 113     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 114     DEF_YMM_OFFS(0),
 115     DEF_YMM_OFFS(1),
 116     // 2..15 are implied in range usage
 117     r31_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 118     r31H_off,
 119     r30_off, r30H_off,
 120     r29_off, r29H_off,
 121     r28_off, r28H_off,
 122     r27_off, r27H_off,
 123     r26_off, r26H_off,
 124     r25_off, r25H_off,
 125     r24_off, r24H_off,
 126     r23_off, r23H_off,
 127     r22_off, r22H_off,
 128     r21_off, r21H_off,
 129     r20_off, r20H_off,
 130     r19_off, r19H_off,
 131     r18_off, r18H_off,
 132     r17_off, r17H_off,
 133     r16_off, r16H_off,
 134     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 135     DEF_OPMASK_OFFS(0),
 136     DEF_OPMASK_OFFS(1),
 137     // 2..7 are implied in range usage
 138     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 139     DEF_ZMM_OFFS(0),
 140     DEF_ZMM_OFFS(1),
 141     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 142     DEF_ZMM_UPPER_OFFS(16),
 143     DEF_ZMM_UPPER_OFFS(17),
 144     // 18..31 are implied in range usage
 145     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 146     fpu_stateH_end,
 147     r15_off, r15H_off,
 148     r14_off, r14H_off,
 149     r13_off, r13H_off,
 150     r12_off, r12H_off,
 151     r11_off, r11H_off,
 152     r10_off, r10H_off,
 153     r9_off,  r9H_off,
 154     r8_off,  r8H_off,
 155     rdi_off, rdiH_off,
 156     rsi_off, rsiH_off,
 157     ignore_off, ignoreH_off,  // extra copy of rbp
 158     rsp_off, rspH_off,
 159     rbx_off, rbxH_off,
 160     rdx_off, rdxH_off,
 161     rcx_off, rcxH_off,
 162     rax_off, raxH_off,
 163     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 164     align_off, alignH_off,
 165     flags_off, flagsH_off,
 166     // The frame sender code expects that rbp will be in the "natural" place and
 167     // will override any oopMap setting for it. We must therefore force the layout
 168     // so that it agrees with the frame sender code.
 169     rbp_off, rbpH_off,        // copy of rbp we will restore
 170     return_off, returnH_off,  // slot for return address
 171     reg_save_size             // size in compiler stack slots
 172   };
 173 
 174  public:
 175   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 176   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 177 
 178   // Offsets into the register save area
 179   // Used by deoptimization when it is managing result register
 180   // values on its own
 181 
 182   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 183   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 184   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 185   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 186   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 187 
 188   // During deoptimization only the result registers need to be restored,
 189   // all the other values have already been extracted.
 190   static void restore_result_registers(MacroAssembler* masm);
 191 };
 192 
 193 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 194   int off = 0;
 195   int num_xmm_regs = XMMRegister::available_xmm_registers();
 196 #if COMPILER2_OR_JVMCI
 197   if (save_wide_vectors && UseAVX == 0) {
 198     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 199   }
 200   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 201 #else
 202   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 203 #endif
 204 
 205   // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
 206   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 207   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 208   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 209   // CodeBlob frame size is in words.
 210   int frame_size_in_words = frame_size_in_bytes / wordSize;
 211   *total_frame_words = frame_size_in_words;
 212 
 213   // Save registers, fpu state, and flags.
 214   // We assume caller has already pushed the return address onto the
 215   // stack, so rsp is 8-byte aligned here.
 216   // We push rbp twice in this sequence because we want the real rbp
 217   // to be under the return address like a normal enter.
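  // Roughly, per the layout enum above, the completed save area looks like this
  // (low addresses first): the fxsave/xsave area, then r15..r8, rdi, rsi, an
  // extra rbp copy, rsp, rbx, rdx, rcx, rax, an alignment word, the flags, the
  // saved rbp pushed by enter(), and finally the return address.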
 218 
 219   __ enter();          // rsp becomes 16-byte aligned here
 220   __ pushf();
 221   // Make sure rsp stays 16-byte aligned
 222   __ subq(rsp, 8);
 223   // Push CPU state in multiple of 16 bytes
 224   __ save_legacy_gprs();
 225   __ push_FPU_state();
 226 
 227 
 228   // push cpu state handles this on EVEX enabled targets
 229   if (save_wide_vectors) {
 230     // Save upper half of YMM registers(0..15)
 231     int base_addr = XSAVE_AREA_YMM_BEGIN;
 232     for (int n = 0; n < 16; n++) {
 233       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 234     }
 235     if (VM_Version::supports_evex()) {
 236       // Save upper half of ZMM registers(0..15)
 237       base_addr = XSAVE_AREA_ZMM_BEGIN;
 238       for (int n = 0; n < 16; n++) {
 239         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 240       }
 241       // Save full ZMM registers(16..num_xmm_regs)
 242       base_addr = XSAVE_AREA_UPPERBANK;
 243       off = 0;
 244       int vector_len = Assembler::AVX_512bit;
 245       for (int n = 16; n < num_xmm_regs; n++) {
 246         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 247       }
 248 #if COMPILER2_OR_JVMCI
 249       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 250       off = 0;
 251       for(int n = 0; n < KRegister::number_of_registers; n++) {
 252         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 253       }
 254 #endif
 255     }
 256   } else {
 257     if (VM_Version::supports_evex()) {
 258       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 259       int base_addr = XSAVE_AREA_UPPERBANK;
 260       off = 0;
 261       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 262       for (int n = 16; n < num_xmm_regs; n++) {
 263         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 264       }
 265 #if COMPILER2_OR_JVMCI
 266       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 267       off = 0;
 268       for(int n = 0; n < KRegister::number_of_registers; n++) {
 269         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 270       }
 271 #endif
 272     }
 273   }
 274 
 275 #if COMPILER2_OR_JVMCI
 276   if (UseAPX) {
 277       int base_addr = XSAVE_AREA_EGPRS;
 278       off = 0;
 279       for(int n = 16; n < Register::number_of_registers; n++) {
 280         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 281       }
 282   }
 283 #endif
 284 
 285   __ vzeroupper();
 286   if (frame::arg_reg_save_area_bytes != 0) {
 287     // Allocate argument register save area
 288     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 289   }
 290 
 291   // Set an oopmap for the call site.  This oopmap will map all
 292   // oop-registers and debug-info registers as callee-saved.  This
 293   // will allow deoptimization at this safepoint to find all possible
 294   // debug-info recordings, as well as let GC find all oops.
 295 
 296   OopMapSet *oop_maps = new OopMapSet();
 297   OopMap* map = new OopMap(frame_size_in_slots, 0);
 298 
 299 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 300 
 301   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 305   // rbp location is known implicitly by the frame sender code, needs no oopmap
 306   // and the location where rbp was saved is ignored
 307   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 308   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 309   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 310   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 311   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 312   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 313   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 314   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 315   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 316   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 317 
 318   if (UseAPX) {
 319     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 324     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 325     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 326     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 327     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 328     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 329     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 330     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 331     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 332     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 333     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 334     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 335   }
 336   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 337   // on EVEX enabled targets, we get it included in the xsave area
 338   off = xmm0_off;
 339   int delta = xmm1_off - off;
 340   for (int n = 0; n < 16; n++) {
 341     XMMRegister xmm_name = as_XMMRegister(n);
 342     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 343     off += delta;
 344   }
 345   if (UseAVX > 2) {
 346     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 347     off = zmm16_off;
 348     delta = zmm17_off - off;
 349     for (int n = 16; n < num_xmm_regs; n++) {
 350       XMMRegister zmm_name = as_XMMRegister(n);
 351       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 352       off += delta;
 353     }
 354   }
 355 
 356 #if COMPILER2_OR_JVMCI
 357   if (save_wide_vectors) {
 358     // Save upper half of YMM registers(0..15)
 359     off = ymm0_off;
 360     delta = ymm1_off - ymm0_off;
 361     for (int n = 0; n < 16; n++) {
 362       XMMRegister ymm_name = as_XMMRegister(n);
 363       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 364       off += delta;
 365     }
 366     if (VM_Version::supports_evex()) {
 367       // Save upper half of ZMM registers(0..15)
 368       off = zmm0_off;
 369       delta = zmm1_off - zmm0_off;
 370       for (int n = 0; n < 16; n++) {
 371         XMMRegister zmm_name = as_XMMRegister(n);
 372         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 373         off += delta;
 374       }
 375     }
 376   }
 377 #endif // COMPILER2_OR_JVMCI
 378 
 379   // %%% These should all be a waste but we'll keep things as they were for now
 380   if (true) {
 381     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 385     // rbp location is known implicitly by the frame sender code, needs no oopmap
 386     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 387     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 388     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 389     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 390     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 391     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 392     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 393     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 394     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 395     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 396     if (UseAPX) {
 397       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 402       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 403       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 404       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 405       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 406       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 407       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 408       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 409       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 410       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 411       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 412       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 413     }
 414     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 415     // on EVEX enabled targets, we get it included in the xsave area
 416     off = xmm0H_off;
 417     delta = xmm1H_off - off;
 418     for (int n = 0; n < 16; n++) {
 419       XMMRegister xmm_name = as_XMMRegister(n);
 420       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 421       off += delta;
 422     }
 423     if (UseAVX > 2) {
 424       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 425       off = zmm16H_off;
 426       delta = zmm17H_off - off;
 427       for (int n = 16; n < num_xmm_regs; n++) {
 428         XMMRegister zmm_name = as_XMMRegister(n);
 429         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 430         off += delta;
 431       }
 432     }
 433   }
 434 
 435   return map;
 436 }
 437 
 438 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 439   int num_xmm_regs = XMMRegister::available_xmm_registers();
 440   if (frame::arg_reg_save_area_bytes != 0) {
 441     // Pop arg register save area
 442     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 443   }
 444 
 445 #if COMPILER2_OR_JVMCI
 446   if (restore_wide_vectors) {
 447     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 448     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 449   }
 450 #else
 451   assert(!restore_wide_vectors, "vectors are generated only by C2");
 452 #endif
 453 
 454   __ vzeroupper();
 455 
 456   // On EVEX enabled targets everything is handled in pop fpu state
 457   if (restore_wide_vectors) {
 458     // Restore upper half of YMM registers (0..15)
 459     int base_addr = XSAVE_AREA_YMM_BEGIN;
 460     for (int n = 0; n < 16; n++) {
 461       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 462     }
 463     if (VM_Version::supports_evex()) {
 464       // Restore upper half of ZMM registers (0..15)
 465       base_addr = XSAVE_AREA_ZMM_BEGIN;
 466       for (int n = 0; n < 16; n++) {
 467         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 468       }
 469       // Restore full ZMM registers(16..num_xmm_regs)
 470       base_addr = XSAVE_AREA_UPPERBANK;
 471       int vector_len = Assembler::AVX_512bit;
 472       int off = 0;
 473       for (int n = 16; n < num_xmm_regs; n++) {
 474         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 475       }
 476 #if COMPILER2_OR_JVMCI
 477       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 478       off = 0;
 479       for (int n = 0; n < KRegister::number_of_registers; n++) {
 480         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 481       }
 482 #endif
 483     }
 484   } else {
 485     if (VM_Version::supports_evex()) {
 486       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 487       int base_addr = XSAVE_AREA_UPPERBANK;
 488       int off = 0;
 489       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 490       for (int n = 16; n < num_xmm_regs; n++) {
 491         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 492       }
 493 #if COMPILER2_OR_JVMCI
 494       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 495       off = 0;
 496       for (int n = 0; n < KRegister::number_of_registers; n++) {
 497         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 498       }
 499 #endif
 500     }
 501   }
 502 
 503 #if COMPILER2_OR_JVMCI
 504   if (UseAPX) {
 505     int base_addr = XSAVE_AREA_EGPRS;
 506     int off = 0;
 507     for (int n = 16; n < Register::number_of_registers; n++) {
 508       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 509     }
 510   }
 511 #endif
 512 
 513   // Recover CPU state
 514   __ pop_FPU_state();
 515   __ restore_legacy_gprs();
 516   __ addq(rsp, 8);
 517   __ popf();
 518   // Get the rbp described implicitly by the calling convention (no oopMap)
 519   __ pop(rbp);
 520 }
 521 
 522 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 523 
 524   // Just restore result register. Only used by deoptimization. By
 525   // now any callee save register that needs to be restored to a c2
 526   // caller of the deoptee has been extracted into the vframeArray
 527   // and will be stuffed into the c2i adapter we create for later
 528   // restoration so only result registers need to be restored here.
 529 
 530   // Restore fp result register
 531   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 532   // Restore integer result register
 533   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 534   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 535 
 536   // Pop all of the register save area off the stack except the return address
 537   __ addptr(rsp, return_offset_in_bytes());
 538 }
 539 
 540 // Is vector's size (in bytes) bigger than a size saved by default?
 541 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
 542 bool SharedRuntime::is_wide_vector(int size) {
 543   return size > 16;
 544 }
 545 
 546 // ---------------------------------------------------------------------------
 547 // Read the array of BasicTypes from a signature, and compute where the
 548 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 549 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 550 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 551 // as framesizes are fixed.
 552 // VMRegImpl::stack0 refers to the first slot 0(sp),
 553 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
 554 // Registers up to Register::number_of_registers are the 64-bit
 555 // integer registers.
 556 
 557 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 558 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 559 // units regardless of build. Of course for i486 there is no 64 bit build
 560 
 561 // The Java calling convention is a "shifted" version of the C ABI.
 562 // By skipping the first C ABI register we can call non-static jni methods
 563 // with small numbers of arguments without having to shuffle the arguments
 564 // at all. Since we control the java ABI we ought to at least get some
 565 // advantage out of it.
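// Illustrative example: for a signature of (int, long, Object, float) the loop
// below assigns
//   int    -> j_rarg0  (set1)
//   long   -> j_rarg1  (set2; the trailing T_VOID half is set_bad)
//   Object -> j_rarg2  (set2)
//   float  -> j_farg0  (set1)
// and returns 0 since no stack slots were needed.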
 566 
 567 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 568                                            VMRegPair *regs,
 569                                            int total_args_passed) {
 570 
 571   // Create the mapping between argument positions and
 572   // registers.
 573   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 574     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 575   };
 576   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 577     j_farg0, j_farg1, j_farg2, j_farg3,
 578     j_farg4, j_farg5, j_farg6, j_farg7
 579   };
 580 
 581 
 582   uint int_args = 0;
 583   uint fp_args = 0;
 584   uint stk_args = 0;
 585 
 586   for (int i = 0; i < total_args_passed; i++) {
 587     switch (sig_bt[i]) {
 588     case T_BOOLEAN:
 589     case T_CHAR:
 590     case T_BYTE:
 591     case T_SHORT:
 592     case T_INT:
 593       if (int_args < Argument::n_int_register_parameters_j) {
 594         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 595       } else {
 596         stk_args = align_up(stk_args, 2);
 597         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 598         stk_args += 1;
 599       }
 600       break;
 601     case T_VOID:
 602       // halves of T_LONG or T_DOUBLE
 603       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 604       regs[i].set_bad();
 605       break;
 606     case T_LONG:
 607       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 608       // fall through
 609     case T_OBJECT:
 610     case T_ARRAY:
 611     case T_ADDRESS:
 612       if (int_args < Argument::n_int_register_parameters_j) {
 613         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 614       } else {
 615         stk_args = align_up(stk_args, 2);
 616         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 617         stk_args += 2;
 618       }
 619       break;
 620     case T_FLOAT:
 621       if (fp_args < Argument::n_float_register_parameters_j) {
 622         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 623       } else {
 624         stk_args = align_up(stk_args, 2);
 625         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 626         stk_args += 1;
 627       }
 628       break;
 629     case T_DOUBLE:
 630       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 631       if (fp_args < Argument::n_float_register_parameters_j) {
 632         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 633       } else {
 634         stk_args = align_up(stk_args, 2);
 635         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 636         stk_args += 2;
 637       }
 638       break;
 639     default:
 640       ShouldNotReachHere();
 641       break;
 642     }
 643   }
 644 
 645   return stk_args;
 646 }
 647 
 648 // Patch the callers callsite with entry to compiled code if it exists.
 649 static void patch_callers_callsite(MacroAssembler *masm) {
 650   Label L;
 651   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 652   __ jcc(Assembler::equal, L);
 653 
 654   // Save the current stack pointer
 655   __ mov(r13, rsp);
 656   // Schedule the branch target address early.
 657   // Call into the VM to patch the caller, then jump to compiled callee
 658   // rax isn't live so capture return address while we easily can
 659   __ movptr(rax, Address(rsp, 0));
 660 
 661   // align stack so push_CPU_state doesn't fault
 662   __ andptr(rsp, -(StackAlignmentInBytes));
 663   __ push_CPU_state();
 664   __ vzeroupper();
 665   // VM needs caller's callsite
 666   // VM needs target method
 667   // This needs to be a long call since we will relocate this adapter to
 668   // the codeBuffer and it may not reach
 669 
 670   // Allocate argument register save area
 671   if (frame::arg_reg_save_area_bytes != 0) {
 672     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 673   }
 674   __ mov(c_rarg0, rbx);
 675   __ mov(c_rarg1, rax);
 676   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 677 
 678   // De-allocate argument register save area
 679   if (frame::arg_reg_save_area_bytes != 0) {
 680     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 681   }
 682 
 683   __ vzeroupper();
 684   __ pop_CPU_state();
 685   // restore sp
 686   __ mov(rsp, r13);
 687   __ bind(L);
 688 }
 689 
 690 
 691 static void gen_c2i_adapter(MacroAssembler *masm,
 692                             int total_args_passed,
 693                             int comp_args_on_stack,
 694                             const BasicType *sig_bt,
 695                             const VMRegPair *regs,
 696                             Label& skip_fixup) {
 697   // Before we get into the guts of the C2I adapter, see if we should be here
 698   // at all.  We've come from compiled code and are attempting to jump to the
 699   // interpreter, which means the caller made a static call to get here
 700   // (vcalls always get a compiled target if there is one).  Check for a
 701   // compiled target.  If there is one, we need to patch the caller's call.
 702   patch_callers_callsite(masm);
 703 
 704   __ bind(skip_fixup);
 705 
 706   // Since all args are passed on the stack, total_args_passed *
 707   // Interpreter::stackElementSize is the space we need.
 708 
 709   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 710 
 711   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 712 
 713   // stack is aligned, keep it that way
 714   // This is not currently needed or enforced by the interpreter, but
 715   // we might as well conform to the ABI.
 716   extraspace = align_up(extraspace, 2*wordSize);
 717 
 718   // set senderSP value
 719   __ lea(r13, Address(rsp, wordSize));
 720 
 721 #ifdef ASSERT
 722   __ check_stack_alignment(r13, "sender stack not aligned");
 723 #endif
 724   if (extraspace > 0) {
 725     // Pop the return address
 726     __ pop(rax);
 727 
 728     __ subptr(rsp, extraspace);
 729 
 730     // Push the return address
 731     __ push(rax);
 732 
 733     // Account for the return address location since we store it first rather
 734     // than hold it in a register across all the shuffling
 735     extraspace += wordSize;
 736   }
 737 
 738 #ifdef ASSERT
 739   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 740 #endif
 741 
 742   // Now write the args into the outgoing interpreter space
 743   for (int i = 0; i < total_args_passed; i++) {
 744     if (sig_bt[i] == T_VOID) {
 745       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 746       continue;
 747     }
 748 
 749     // offset to start parameters
 750     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 751     int next_off = st_off - Interpreter::stackElementSize;
 752 
 753     // Say 4 args:
 754     // i   st_off
 755     // 0   32 T_LONG
 756     // 1   24 T_VOID
 757     // 2   16 T_OBJECT
 758     // 3    8 T_BOOL
 759     // -    0 return address
 760     //
 761     // However, to make things extra confusing: because we can fit a long/double in
 762     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 763     // leaves one slot empty and only stores to a single slot. In this case the
 764     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
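    // Illustrative: with the 4-arg example above, the T_LONG at i == 0 is written
    // as a single 8-byte store at next_off == 24 (the T_VOID slot), while
    // st_off == 32 is left unused (and filled with junk in debug builds).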
 765 
 766     VMReg r_1 = regs[i].first();
 767     VMReg r_2 = regs[i].second();
 768     if (!r_1->is_valid()) {
 769       assert(!r_2->is_valid(), "");
 770       continue;
 771     }
 772     if (r_1->is_stack()) {
 773       // memory to memory use rax
 774       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 775       if (!r_2->is_valid()) {
 776         // sign extend??
 777         __ movl(rax, Address(rsp, ld_off));
 778         __ movptr(Address(rsp, st_off), rax);
 779 
 780       } else {
 781 
 782         __ movq(rax, Address(rsp, ld_off));
 783 
 784         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 785         // T_DOUBLE and T_LONG use two slots in the interpreter
 786         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 787           // ld_off == LSW, ld_off+wordSize == MSW
 788           // st_off == MSW, next_off == LSW
 789           __ movq(Address(rsp, next_off), rax);
 790 #ifdef ASSERT
 791           // Overwrite the unused slot with known junk
 792           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 793           __ movptr(Address(rsp, st_off), rax);
 794 #endif /* ASSERT */
 795         } else {
 796           __ movq(Address(rsp, st_off), rax);
 797         }
 798       }
 799     } else if (r_1->is_Register()) {
 800       Register r = r_1->as_Register();
 801       if (!r_2->is_valid()) {
 802         // must be only an int (or smaller) so move only 32 bits to the slot
 803         // why not sign extend??
 804         __ movl(Address(rsp, st_off), r);
 805       } else {
 806         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 807         // T_DOUBLE and T_LONG use two slots in the interpreter
 808         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 809           // long/double in gpr
 810 #ifdef ASSERT
 811           // Overwrite the unused slot with known junk
 812           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 813           __ movptr(Address(rsp, st_off), rax);
 814 #endif /* ASSERT */
 815           __ movq(Address(rsp, next_off), r);
 816         } else {
 817           __ movptr(Address(rsp, st_off), r);
 818         }
 819       }
 820     } else {
 821       assert(r_1->is_XMMRegister(), "");
 822       if (!r_2->is_valid()) {
 823         // only a float, use just part of the slot
 824         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 825       } else {
 826 #ifdef ASSERT
 827         // Overwrite the unused slot with known junk
 828         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 829         __ movptr(Address(rsp, st_off), rax);
 830 #endif /* ASSERT */
 831         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 832       }
 833     }
 834   }
 835 
 836   // Schedule the branch target address early.
 837   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 838   __ jmp(rcx);
 839 }
 840 
 841 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 842                         address code_start, address code_end,
 843                         Label& L_ok) {
 844   Label L_fail;
 845   __ lea(temp_reg, ExternalAddress(code_start));
 846   __ cmpptr(pc_reg, temp_reg);
 847   __ jcc(Assembler::belowEqual, L_fail);
 848   __ lea(temp_reg, ExternalAddress(code_end));
 849   __ cmpptr(pc_reg, temp_reg);
 850   __ jcc(Assembler::below, L_ok);
 851   __ bind(L_fail);
 852 }
 853 
 854 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 855                                     int total_args_passed,
 856                                     int comp_args_on_stack,
 857                                     const BasicType *sig_bt,
 858                                     const VMRegPair *regs) {
 859 
 860   // Note: r13 contains the senderSP on entry. We must preserve it since
 861   // we may do an i2c -> c2i transition if we lose a race where compiled
 862   // code goes non-entrant while we get args ready.
 863   // In addition we use r13 to locate all the interpreter args as
 864   // we must align the stack to 16 bytes on an i2c entry else we
 865   // lose alignment we expect in all compiled code and register
 866   // save code can segv when fxsave instructions find an improperly
 867   // aligned stack pointer.
 868 
 869   // Adapters can be frameless because they do not require the caller
 870   // to perform additional cleanup work, such as correcting the stack pointer.
 871   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 872   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 873   // even if a callee has modified the stack pointer.
 874   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 875   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 876   // up via the senderSP register).
 877   // In other words, if *either* the caller or callee is interpreted, we can
 878   // get the stack pointer repaired after a call.
 879   // This is why c2i and i2c adapters cannot be indefinitely composed.
 880   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 881   // both caller and callee would be compiled methods, and neither would
 882   // clean up the stack pointer changes performed by the two adapters.
 883   // If this happens, control eventually transfers back to the compiled
 884   // caller, but with an uncorrected stack, causing delayed havoc.
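  // Concretely: in a compiled -> c2i -> interpreted chain the interpreted callee
  // repairs the compiled caller's SP via sender_sp, and in an interpreted -> i2c
  // -> compiled chain the interpreted caller repairs its own SP from
  // interpreter_frame_last_sp; either way exactly one interpreted frame does the
  // repair.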
 885 
 886   if (VerifyAdapterCalls &&
 887       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 888     // So, let's test for cascading c2i/i2c adapters right now.
 889     //  assert(Interpreter::contains($return_addr) ||
 890     //         StubRoutines::contains($return_addr),
 891     //         "i2c adapter must return to an interpreter frame");
 892     __ block_comment("verify_i2c { ");
 893     // Pick up the return address
 894     __ movptr(rax, Address(rsp, 0));
 895     Label L_ok;
 896     if (Interpreter::code() != nullptr) {
 897       range_check(masm, rax, r11,
 898                   Interpreter::code()->code_start(),
 899                   Interpreter::code()->code_end(),
 900                   L_ok);
 901     }
 902     if (StubRoutines::initial_stubs_code() != nullptr) {
 903       range_check(masm, rax, r11,
 904                   StubRoutines::initial_stubs_code()->code_begin(),
 905                   StubRoutines::initial_stubs_code()->code_end(),
 906                   L_ok);
 907     }
 908     if (StubRoutines::final_stubs_code() != nullptr) {
 909       range_check(masm, rax, r11,
 910                   StubRoutines::final_stubs_code()->code_begin(),
 911                   StubRoutines::final_stubs_code()->code_end(),
 912                   L_ok);
 913     }
 914     const char* msg = "i2c adapter must return to an interpreter frame";
 915     __ block_comment(msg);
 916     __ stop(msg);
 917     __ bind(L_ok);
 918     __ block_comment("} verify_i2c ");
 919   }
 920 
 921   // Must preserve original SP for loading incoming arguments because
 922   // we need to align the outgoing SP for compiled code.
 923   __ movptr(r11, rsp);
 924 
 925   // Pick up the return address
 926   __ pop(rax);
 927 
 928   // Convert 4-byte c2 stack slots to words.
 929   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 930 
 931   if (comp_args_on_stack) {
 932     __ subptr(rsp, comp_words_on_stack * wordSize);
 933   }
 934 
 935   // Ensure compiled code always sees stack at proper alignment
 936   __ andptr(rsp, -16);
 937 
 938   // Push the return address and misalign the stack so that the youngest frame sees
 939   // the stack the way it always does right after a call instruction
 940   __ push(rax);
 941 
 942   // Put saved SP in another register
 943   const Register saved_sp = rax;
 944   __ movptr(saved_sp, r11);
 945 
 946   // Will jump to the compiled code just as if compiled code was doing it.
 947   // Pre-load the register-jump target early, to schedule it better.
 948   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 949 
 950 #if INCLUDE_JVMCI
 951   if (EnableJVMCI) {
 952     // check if this call should be routed towards a specific entry point
 953     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 954     Label no_alternative_target;
 955     __ jcc(Assembler::equal, no_alternative_target);
 956     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 957     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 958     __ bind(no_alternative_target);
 959   }
 960 #endif // INCLUDE_JVMCI
 961 
 962   // Now generate the shuffle code.  Pick up all register args and move the
 963   // rest through the floating point stack top.
 964   for (int i = 0; i < total_args_passed; i++) {
 965     if (sig_bt[i] == T_VOID) {
 966       // Longs and doubles are passed in native word order, but misaligned
 967       // in the 32-bit build.
 968       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 969       continue;
 970     }
 971 
 972     // Pick up 0, 1 or 2 words from SP+offset.
 973 
 974     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 975             "scrambled load targets?");
 976     // Load in argument order going down.
 977     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 978     // Point to interpreter value (vs. tag)
 979     int next_off = ld_off - Interpreter::stackElementSize;
 980     //
 981     //
 982     //
 983     VMReg r_1 = regs[i].first();
 984     VMReg r_2 = regs[i].second();
 985     if (!r_1->is_valid()) {
 986       assert(!r_2->is_valid(), "");
 987       continue;
 988     }
 989     if (r_1->is_stack()) {
 990       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 991       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 992 
 993       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 994       // and if we end up going through a c2i because of a miss, a reasonable value of r13
 995       // will be generated.
 996       if (!r_2->is_valid()) {
 997         // sign extend???
 998         __ movl(r13, Address(saved_sp, ld_off));
 999         __ movptr(Address(rsp, st_off), r13);
1000       } else {
1001         //
1002         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
1003         // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
1004         // so we must adjust where to pick up the data to match the interpreter.
1005         //
1006         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
1007         // are accessed as negative offsets, so the LSW is at the LOW address
1008 
1009         // ld_off is MSW so get LSW
1010         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
1011                            next_off : ld_off;
1012         __ movq(r13, Address(saved_sp, offset));
1013         // st_off is LSW (i.e. reg.first())
1014         __ movq(Address(rsp, st_off), r13);
1015       }
1016     } else if (r_1->is_Register()) {  // Register argument
1017       Register r = r_1->as_Register();
1018       assert(r != rax, "must be different");
1019       if (r_2->is_valid()) {
1020         //
1021         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
1022         // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
1023         // so we must adjust where to pick up the data to match the interpreter.
1024 
1025         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
1026                            next_off : ld_off;
1027 
1028         // this can be a misaligned move
1029         __ movq(r, Address(saved_sp, offset));
1030       } else {
1031         // sign extend and use a full word?
1032         __ movl(r, Address(saved_sp, ld_off));
1033       }
1034     } else {
1035       if (!r_2->is_valid()) {
1036         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1037       } else {
1038         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1039       }
1040     }
1041   }
1042 
1043   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1044 
1045   // 6243940 We might end up in handle_wrong_method if
1046   // the callee is deoptimized as we race through here. If that
1047   // happens we don't want to take a safepoint because the
1048   // caller frame will look interpreted and arguments are now
1049   // "compiled" so it is much better to make this transition
1050   // invisible to the stack walking code. Unfortunately if
1051   // we try to find the callee by normal means, a safepoint
1052   // is possible. So we stash the desired callee in the thread
1053   // and the VM will find it there should this case occur.
1054 
1055   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1056 
1057   // put Method* where a c2i would expect should we end up there
1058   // only needed because of c2 resolve stubs which return Method* as a result in
1059   // rax
1060   __ mov(rax, rbx);
1061   __ jmp(r11);
1062 }
1063 
1064 // ---------------------------------------------------------------
1065 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1066                                                             int total_args_passed,
1067                                                             int comp_args_on_stack,
1068                                                             const BasicType *sig_bt,
1069                                                             const VMRegPair *regs,
1070                                                             AdapterFingerPrint* fingerprint) {
1071   address i2c_entry = __ pc();
1072 
1073   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1074 
1075   // -------------------------------------------------------------------------
1076   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1077   // to the interpreter.  The args start out packed in the compiled layout.  They
1078   // need to be unpacked into the interpreter layout.  This will almost always
1079   // require some stack space.  We grow the current (compiled) stack, then repack
1080   // the args.  We  finally end in a jump to the generic interpreter entry point.
1081   // On exit from the interpreter, the interpreter will restore our SP (lest the
1082   // compiled code, which relies solely on SP and not RBP, get sick).
1083 
1084   address c2i_unverified_entry = __ pc();
1085   Label skip_fixup;
1086 
1087   Register data = rax;
1088   Register receiver = j_rarg0;
1089   Register temp = rbx;
1090 
1091   {
1092     __ ic_check(1 /* end_alignment */);
1093     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1094     // Method might have been compiled since the call site was patched to
1095     // interpreted; if that is the case, treat it as a miss so we can get
1096     // the call site corrected.
1097     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1098     __ jcc(Assembler::equal, skip_fixup);
1099     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1100   }
1101 
1102   address c2i_entry = __ pc();
1103 
1104   // Class initialization barrier for static methods
1105   address c2i_no_clinit_check_entry = nullptr;
1106   if (VM_Version::supports_fast_class_init_checks()) {
1107     Label L_skip_barrier;
1108     Register method = rbx;
1109 
1110     { // Bypass the barrier for non-static methods
1111       Register flags = rscratch1;
1112       __ movl(flags, Address(method, Method::access_flags_offset()));
1113       __ testl(flags, JVM_ACC_STATIC);
1114       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1115     }
1116 
1117     Register klass = rscratch1;
1118     __ load_method_holder(klass, method);
1119     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1120 
1121     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1122 
1123     __ bind(L_skip_barrier);
1124     c2i_no_clinit_check_entry = __ pc();
1125   }
1126 
1127   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1128   bs->c2i_entry_barrier(masm);
1129 
1130   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1131 
1132   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1133 }
1134 
1135 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1136                                          VMRegPair *regs,
1137                                          int total_args_passed) {
1138 
1139 // We return the amount of VMRegImpl stack slots we need to reserve for all
1140 // the arguments NOT counting out_preserve_stack_slots.
1141 
1142 // NOTE: These arrays will have to change when c1 is ported
1143 #ifdef _WIN64
1144     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1145       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1146     };
1147     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1148       c_farg0, c_farg1, c_farg2, c_farg3
1149     };
1150 #else
1151     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1152       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1153     };
1154     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1155       c_farg0, c_farg1, c_farg2, c_farg3,
1156       c_farg4, c_farg5, c_farg6, c_farg7
1157     };
1158 #endif // _WIN64
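    // Note the contrast with the Java convention above: on Windows the integer and
    // FP argument registers share one positional sequence (the loop below bumps
    // both counters), so for (int, double) the args land in c_rarg0 and c_farg1,
    // whereas on Linux/System V the two register files are counted independently
    // and the same signature uses c_rarg0 and c_farg0. This is just a sketch of
    // the int_args/fp_args bookkeeping, not an ABI reference.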
1159 
1160 
1161     uint int_args = 0;
1162     uint fp_args = 0;
1163     uint stk_args = 0; // inc by 2 each time
1164 
1165     for (int i = 0; i < total_args_passed; i++) {
1166       switch (sig_bt[i]) {
1167       case T_BOOLEAN:
1168       case T_CHAR:
1169       case T_BYTE:
1170       case T_SHORT:
1171       case T_INT:
1172         if (int_args < Argument::n_int_register_parameters_c) {
1173           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1174 #ifdef _WIN64
1175           fp_args++;
1176           // Allocate slots for callee to stuff register args on the stack.
1177           stk_args += 2;
1178 #endif
1179         } else {
1180           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1181           stk_args += 2;
1182         }
1183         break;
1184       case T_LONG:
1185         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1186         // fall through
1187       case T_OBJECT:
1188       case T_ARRAY:
1189       case T_ADDRESS:
1190       case T_METADATA:
1191         if (int_args < Argument::n_int_register_parameters_c) {
1192           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1193 #ifdef _WIN64
1194           fp_args++;
1195           stk_args += 2;
1196 #endif
1197         } else {
1198           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1199           stk_args += 2;
1200         }
1201         break;
1202       case T_FLOAT:
1203         if (fp_args < Argument::n_float_register_parameters_c) {
1204           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1205 #ifdef _WIN64
1206           int_args++;
1207           // Allocate slots for callee to stuff register args on the stack.
1208           stk_args += 2;
1209 #endif
1210         } else {
1211           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1212           stk_args += 2;
1213         }
1214         break;
1215       case T_DOUBLE:
1216         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1217         if (fp_args < Argument::n_float_register_parameters_c) {
1218           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1219 #ifdef _WIN64
1220           int_args++;
1221           // Allocate slots for callee to stuff register args on the stack.
1222           stk_args += 2;
1223 #endif
1224         } else {
1225           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1226           stk_args += 2;
1227         }
1228         break;
1229       case T_VOID: // Halves of longs and doubles
1230         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1231         regs[i].set_bad();
1232         break;
1233       default:
1234         ShouldNotReachHere();
1235         break;
1236       }
1237     }
1238 #ifdef _WIN64
1239   // The Windows ABI requires that we always allocate enough stack space
1240   // for 4 64-bit registers to be stored down.
1241   if (stk_args < 8) {
1242     stk_args = 8;
1243   }
1244 #endif // _WIN64
1245 
1246   return stk_args;
1247 }
1248 
1249 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1250                                              uint num_bits,
1251                                              uint total_args_passed) {
1252   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1253          "only certain vector sizes are supported for now");
1254 
1255   static const XMMRegister VEC_ArgReg[32] = {
1256      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1257      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1258     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1259     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1260   };
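  // Sketch of the pairing below: VMReg slots are 32 bits wide, so for
  // num_bits == 256 each vector register is described as the pair
  // vmreg .. vmreg->next(7), i.e. 8 consecutive 32-bit slots; 64-bit vectors
  // span 2 slots, 128-bit span 4, and 512-bit span 16.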
1261 
1262   uint stk_args = 0;
1263   uint fp_args = 0;
1264 
1265   for (uint i = 0; i < total_args_passed; i++) {
1266     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1267     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1268     regs[i].set_pair(vmreg->next(next_val), vmreg);
1269   }
1270 
1271   return stk_args;
1272 }
1273 
1274 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1275   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1276   // which by this time is free to use
1277   switch (ret_type) {
1278   case T_FLOAT:
1279     __ movflt(Address(rbp, -wordSize), xmm0);
1280     break;
1281   case T_DOUBLE:
1282     __ movdbl(Address(rbp, -wordSize), xmm0);
1283     break;
1284   case T_VOID:  break;
1285   default: {
1286     __ movptr(Address(rbp, -wordSize), rax);
1287     }
1288   }
1289 }
1290 
1291 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1292   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
1293   // which by this time is free to use
1294   switch (ret_type) {
1295   case T_FLOAT:
1296     __ movflt(xmm0, Address(rbp, -wordSize));
1297     break;
1298   case T_DOUBLE:
1299     __ movdbl(xmm0, Address(rbp, -wordSize));
1300     break;
1301   case T_VOID:  break;
1302   default: {
1303     __ movptr(rax, Address(rbp, -wordSize));
1304     }
1305   }
1306 }
1307 
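     // Spill and reload the outgoing C argument registers around VM helper calls
     // (DTrace probes, slow-path locking, etc.) made while the outgoing args are live.
     // Integer registers are pushed as single words; XMM registers get a two-word slot.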
1308 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1309     for ( int i = first_arg ; i < arg_count ; i++ ) {
1310       if (args[i].first()->is_Register()) {
1311         __ push(args[i].first()->as_Register());
1312       } else if (args[i].first()->is_XMMRegister()) {
1313         __ subptr(rsp, 2*wordSize);
1314         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1315       }
1316     }
1317 }
1318 
1319 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1320     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1321       if (args[i].first()->is_Register()) {
1322         __ pop(args[i].first()->as_Register());
1323       } else if (args[i].first()->is_XMMRegister()) {
1324         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1325         __ addptr(rsp, 2*wordSize);
1326       }
1327     }
1328 }
1329 
1330 static void verify_oop_args(MacroAssembler* masm,
1331                             const methodHandle& method,
1332                             const BasicType* sig_bt,
1333                             const VMRegPair* regs) {
1334   Register temp_reg = rbx;  // not part of any compiled calling seq
1335   if (VerifyOops) {
1336     for (int i = 0; i < method->size_of_parameters(); i++) {
1337       if (is_reference_type(sig_bt[i])) {
1338         VMReg r = regs[i].first();
1339         assert(r->is_valid(), "bad oop arg");
1340         if (r->is_stack()) {
1341           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1342           __ verify_oop(temp_reg);
1343         } else {
1344           __ verify_oop(r->as_Register());
1345         }
1346       }
1347     }
1348   }
1349 }
1350 
1351 static void check_continuation_enter_argument(VMReg actual_vmreg,
1352                                               Register expected_reg,
1353                                               const char* name) {
1354   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1355   assert(actual_vmreg->as_Register() == expected_reg,
1356          "%s is in unexpected register: %s instead of %s",
1357          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1358 }
1359 
1360 
1361 //---------------------------- continuation_enter_setup ---------------------------
1362 //
1363 // Arguments:
1364 //   None.
1365 //
1366 // Results:
1367 //   rsp: pointer to blank ContinuationEntry
1368 //
1369 // Kills:
1370 //   rax
1371 //
1372 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1373   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1374   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1375   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1376 
1377   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1378   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1379 
1380   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1381   OopMap* map = new OopMap(frame_size, 0);
1382 
1383   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1384   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1385   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1386 
1387   return map;
1388 }
1389 
1390 //---------------------------- fill_continuation_entry ---------------------------
1391 //
1392 // Arguments:
1393 //   rsp: pointer to blank ContinuationEntry
1394 //   reg_cont_obj: pointer to the continuation
1395 //   reg_flags: flags
1396 //
1397 // Results:
1398 //   rsp: pointer to filled out ContinuationEntry
1399 //
1400 // Kills:
1401 //   rax
1402 //
1403 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1404   assert_different_registers(rax, reg_cont_obj, reg_flags);
1405 #ifdef ASSERT
1406   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1407 #endif
1408   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1409   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1410   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1411   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1412   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1413 
1414   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1415   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1416   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1417   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1418 
1419   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1420   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1421 }
1422 
1423 //---------------------------- continuation_enter_cleanup ---------------------------
1424 //
1425 // Arguments:
1426 //   rsp: pointer to the ContinuationEntry
1427 //
1428 // Results:
1429 //   rsp: pointer to the spilled rbp in the entry frame
1430 //
1431 // Kills:
1432 //   rbx
1433 //
1434 static void continuation_enter_cleanup(MacroAssembler* masm) {
1435 #ifdef ASSERT
1436   Label L_good_sp;
1437   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1438   __ jcc(Assembler::equal, L_good_sp);
1439   __ stop("Incorrect rsp at continuation_enter_cleanup");
1440   __ bind(L_good_sp);
1441 #endif
1442   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1443   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1444 
1445   if (CheckJNICalls) {
1446     // Check if this is a virtual thread continuation
1447     Label L_skip_vthread_code;
1448     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1449     __ jcc(Assembler::equal, L_skip_vthread_code);
1450 
1451     // If the held monitor count is > 0 and this vthread is terminating then
1452     // it failed to release a JNI monitor. So we issue the same log message
1453     // that JavaThread::exit does.
1454     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1455     __ jcc(Assembler::equal, L_skip_vthread_code);
1456 
1457     // rax may hold an exception oop, save it before the call
1458     __ push(rax);
1459     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1460     __ pop(rax);
1461 
1462     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1463     // on termination. The held count is implicitly zeroed below when we restore from
1464     // the parent held count (which has to be zero).
1465     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1466 
1467     __ bind(L_skip_vthread_code);
1468   }
1469 #ifdef ASSERT
1470   else {
1471     // Check if this is a virtual thread continuation
1472     Label L_skip_vthread_code;
1473     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1474     __ jcc(Assembler::equal, L_skip_vthread_code);
1475 
1476     // See comment just above. If not checking JNI calls the JNI count is only
1477     // needed for assertion checking.
1478     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1479 
1480     __ bind(L_skip_vthread_code);
1481   }
1482 #endif
1483 
1484   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1485   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1486 
1487   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1488   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1489   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1490 }
1491 
1492 static void gen_continuation_enter(MacroAssembler* masm,
1493                                    const VMRegPair* regs,
1494                                    int& exception_offset,
1495                                    OopMapSet* oop_maps,
1496                                    int& frame_complete,
1497                                    int& stack_slots,
1498                                    int& interpreted_entry_offset,
1499                                    int& compiled_entry_offset) {
1500 
1501   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1502   int pos_cont_obj   = 0;
1503   int pos_is_cont    = 1;
1504   int pos_is_virtual = 2;
1505 
1506   // The platform-specific calling convention may present the arguments in various registers.
1507   // To simplify the rest of the code, we expect the arguments to reside in these known
1508   // registers, and we additionally check the placement here in case the calling convention ever
1509   // changes.
1510   Register reg_cont_obj   = c_rarg1;
1511   Register reg_is_cont    = c_rarg2;
1512   Register reg_is_virtual = c_rarg3;
1513 
1514   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1515   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1516   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1517 
1518   // Utility methods kill rax, make sure there are no collisions
1519   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1520 
1521   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1522                          relocInfo::static_call_type);
1523 
1524   address start = __ pc();
1525 
1526   Label L_thaw, L_exit;
1527 
1528   // i2i entry used at interp_only_mode only
1529   interpreted_entry_offset = __ pc() - start;
1530   {
1531 #ifdef ASSERT
1532     Label is_interp_only;
1533     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1534     __ jcc(Assembler::notEqual, is_interp_only);
1535     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1536     __ bind(is_interp_only);
1537 #endif
1538 
1539     __ pop(rax); // return address
1540     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
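         // The interpreter passed the arguments on its expression stack, first argument
         // deepest: slot 2 holds the Continuation oop, slot 1 isContinue, slot 0 isVirtualThread.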
1541     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1542     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1543     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1544     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1545     __ push(rax); // return address
1546     __ push_cont_fastpath();
1547 
1548     __ enter();
1549 
1550     stack_slots = 2; // will be adjusted in setup
1551     OopMap* map = continuation_enter_setup(masm, stack_slots);
1552     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1553     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1554 
1555     __ verify_oop(reg_cont_obj);
1556 
1557     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1558 
1559     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1560     __ testptr(reg_is_cont, reg_is_cont);
1561     __ jcc(Assembler::notZero, L_thaw);
1562 
1563     // --- Resolve path
1564 
1565     // Make sure the call is patchable
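         // Pad so that the 32-bit displacement of the call below starts on a word
         // boundary; only then can the call be patched atomically once it is resolved.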
1566     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1567     // Emit stub for static call
1568     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1569     if (stub == nullptr) {
1570       fatal("CodeCache is full at gen_continuation_enter");
1571     }
1572     __ call(resolve);
1573     oop_maps->add_gc_map(__ pc() - start, map);
1574     __ post_call_nop();
1575 
1576     __ jmp(L_exit);
1577   }
1578 
1579   // compiled entry
1580   __ align(CodeEntryAlignment);
1581   compiled_entry_offset = __ pc() - start;
1582   __ enter();
1583 
1584   stack_slots = 2; // will be adjusted in setup
1585   OopMap* map = continuation_enter_setup(masm, stack_slots);
1586 
1587   // Frame is now completed as far as size and linkage.
1588   frame_complete = __ pc() - start;
1589 
1590   __ verify_oop(reg_cont_obj);
1591 
1592   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1593 
1594   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1595   __ testptr(reg_is_cont, reg_is_cont);
1596   __ jccb(Assembler::notZero, L_thaw);
1597 
1598   // --- call Continuation.enter(Continuation c, boolean isContinue)
1599 
1600   // Make sure the call is patchable
1601   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1602 
1603   // Emit stub for static call
1604   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1605   if (stub == nullptr) {
1606     fatal("CodeCache is full at gen_continuation_enter");
1607   }
1608 
1609   // The call needs to be resolved. There's a special case for this in
1610   // SharedRuntime::find_callee_info_helper() which calls
1611   // LinkResolver::resolve_continuation_enter() which resolves the call to
1612   // Continuation.enter(Continuation c, boolean isContinue).
1613   __ call(resolve);
1614 
1615   oop_maps->add_gc_map(__ pc() - start, map);
1616   __ post_call_nop();
1617 
1618   __ jmpb(L_exit);
1619 
1620   // --- Thawing path
1621 
1622   __ bind(L_thaw);
1623 
1624   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1625 
1626   ContinuationEntry::_return_pc_offset = __ pc() - start;
1627   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1628   __ post_call_nop();
1629 
1630   // --- Normal exit (resolve/thawing)
1631 
1632   __ bind(L_exit);
1633 
1634   continuation_enter_cleanup(masm);
1635   __ pop(rbp);
1636   __ ret(0);
1637 
1638   // --- Exception handling path
1639 
1640   exception_offset = __ pc() - start;
1641 
1642   continuation_enter_cleanup(masm);
1643   __ pop(rbp);
1644 
1645   __ movptr(c_rarg0, r15_thread);
1646   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1647 
1648   // rax still holds the original exception oop, save it before the call
1649   __ push(rax);
1650 
1651   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1652   __ movptr(rbx, rax);
1653 
1654   // Continue at exception handler:
1655   //   rax: exception oop
1656   //   rbx: exception handler
1657   //   rdx: exception pc
1658   __ pop(rax);
1659   __ verify_oop(rax);
1660   __ pop(rdx);
1661   __ jmp(rbx);
1662 }
1663 
1664 static void gen_continuation_yield(MacroAssembler* masm,
1665                                    const VMRegPair* regs,
1666                                    OopMapSet* oop_maps,
1667                                    int& frame_complete,
1668                                    int& stack_slots,
1669                                    int& compiled_entry_offset) {
1670   enum layout {
1671     rbp_off,
1672     rbpH_off,
1673     return_off,
1674     return_off2,
1675     framesize // inclusive of return address
1676   };
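       // framesize is four 32-bit slots: two for the saved rbp and two for the
       // return address, i.e. two machine words in total.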
1677   stack_slots = framesize / VMRegImpl::slots_per_word;
1678   assert(stack_slots == 2, "recheck layout");
1679 
1680   address start = __ pc();
1681   compiled_entry_offset = __ pc() - start;
1682   __ enter();
1683   address the_pc = __ pc();
1684 
1685   frame_complete = the_pc - start;
1686 
1687   // This nop must be exactly at the PC we push into the frame info.
1688   // We use this nop for fast CodeBlob lookup, associate the OopMap
1689   // with it right away.
1690   __ post_call_nop();
1691   OopMap* map = new OopMap(framesize, 1);
1692   oop_maps->add_gc_map(frame_complete, map);
1693 
1694   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1695   __ movptr(c_rarg0, r15_thread);
1696   __ movptr(c_rarg1, rsp);
1697   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1698   __ reset_last_Java_frame(true);
1699 
1700   Label L_pinned;
1701 
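       // freeze_entry() returns 0 in rax when the continuation was frozen successfully;
       // any non-zero result means the continuation is pinned (or freezing failed) and
       // we must return to the caller instead.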
1702   __ testptr(rax, rax);
1703   __ jcc(Assembler::notZero, L_pinned);
1704 
1705   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1706   continuation_enter_cleanup(masm);
1707   __ pop(rbp);
1708   __ ret(0);
1709 
1710   __ bind(L_pinned);
1711 
1712   // Pinned, return to caller
1713 
1714   // handle pending exception thrown by freeze
1715   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1716   Label ok;
1717   __ jcc(Assembler::equal, ok);
1718   __ leave();
1719   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1720   __ bind(ok);
1721 
1722   __ leave();
1723   __ ret(0);
1724 }
1725 
1726 static void gen_special_dispatch(MacroAssembler* masm,
1727                                  const methodHandle& method,
1728                                  const BasicType* sig_bt,
1729                                  const VMRegPair* regs) {
1730   verify_oop_args(masm, method, sig_bt, regs);
1731   vmIntrinsics::ID iid = method->intrinsic_id();
1732 
1733   // Now write the args into the outgoing interpreter space
1734   bool     has_receiver   = false;
1735   Register receiver_reg   = noreg;
1736   int      member_arg_pos = -1;
1737   Register member_reg     = noreg;
1738   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1739   if (ref_kind != 0) {
1740     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1741     member_reg = rbx;  // known to be free at this point
1742     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1743   } else if (iid == vmIntrinsics::_invokeBasic) {
1744     has_receiver = true;
1745   } else if (iid == vmIntrinsics::_linkToNative) {
1746     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1747     member_reg = rbx;  // known to be free at this point
1748   } else {
1749     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1750   }
1751 
1752   if (member_reg != noreg) {
1753     // Load the member_arg into register, if necessary.
1754     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1755     VMReg r = regs[member_arg_pos].first();
1756     if (r->is_stack()) {
1757       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1758     } else {
1759       // no data motion is needed
1760       member_reg = r->as_Register();
1761     }
1762   }
1763 
1764   if (has_receiver) {
1765     // Make sure the receiver is loaded into a register.
1766     assert(method->size_of_parameters() > 0, "oob");
1767     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1768     VMReg r = regs[0].first();
1769     assert(r->is_valid(), "bad receiver arg");
1770     if (r->is_stack()) {
1771       // Porting note:  This assumes that compiled calling conventions always
1772       // pass the receiver oop in a register.  If this is not true on some
1773       // platform, pick a temp and load the receiver from stack.
1774       fatal("receiver always in a register");
1775       receiver_reg = j_rarg0;  // known to be free at this point
1776       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1777     } else {
1778       // no data motion is needed
1779       receiver_reg = r->as_Register();
1780     }
1781   }
1782 
1783   // Figure out which address we are really jumping to:
1784   MethodHandles::generate_method_handle_dispatch(masm, iid,
1785                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1786 }
1787 
1788 // ---------------------------------------------------------------------------
1789 // Generate a native wrapper for a given method.  The method takes arguments
1790 // in the Java compiled code convention, marshals them to the native
1791 // convention (handlizes oops, etc), transitions to native, makes the call,
1792 // returns to java state (possibly blocking), unhandlizes any result and
1793 // returns.
1794 //
1795 // Critical native functions are a shorthand for the use of
1796 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1797 // functions.  The wrapper is expected to unpack the arguments before
1798 // passing them to the callee. Critical native functions leave the state _in_Java,
1799 // since they cannot stop for GC.
1800 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1801 // block and the check for pending exceptions, since it's impossible for them
1802 // to be thrown.
1803 //
1804 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1805                                                 const methodHandle& method,
1806                                                 int compile_id,
1807                                                 BasicType* in_sig_bt,
1808                                                 VMRegPair* in_regs,
1809                                                 BasicType ret_type) {
1810   if (method->is_continuation_native_intrinsic()) {
1811     int exception_offset = -1;
1812     OopMapSet* oop_maps = new OopMapSet();
1813     int frame_complete = -1;
1814     int stack_slots = -1;
1815     int interpreted_entry_offset = -1;
1816     int vep_offset = -1;
1817     if (method->is_continuation_enter_intrinsic()) {
1818       gen_continuation_enter(masm,
1819                              in_regs,
1820                              exception_offset,
1821                              oop_maps,
1822                              frame_complete,
1823                              stack_slots,
1824                              interpreted_entry_offset,
1825                              vep_offset);
1826     } else if (method->is_continuation_yield_intrinsic()) {
1827       gen_continuation_yield(masm,
1828                              in_regs,
1829                              oop_maps,
1830                              frame_complete,
1831                              stack_slots,
1832                              vep_offset);
1833     } else {
1834       guarantee(false, "Unknown Continuation native intrinsic");
1835     }
1836 
1837 #ifdef ASSERT
1838     if (method->is_continuation_enter_intrinsic()) {
1839       assert(interpreted_entry_offset != -1, "Must be set");
1840       assert(exception_offset != -1,         "Must be set");
1841     } else {
1842       assert(interpreted_entry_offset == -1, "Must be unset");
1843       assert(exception_offset == -1,         "Must be unset");
1844     }
1845     assert(frame_complete != -1,    "Must be set");
1846     assert(stack_slots != -1,       "Must be set");
1847     assert(vep_offset != -1,        "Must be set");
1848 #endif
1849 
1850     __ flush();
1851     nmethod* nm = nmethod::new_native_nmethod(method,
1852                                               compile_id,
1853                                               masm->code(),
1854                                               vep_offset,
1855                                               frame_complete,
1856                                               stack_slots,
1857                                               in_ByteSize(-1),
1858                                               in_ByteSize(-1),
1859                                               oop_maps,
1860                                               exception_offset);
1861     if (nm == nullptr) return nm;
1862     if (method->is_continuation_enter_intrinsic()) {
1863       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1864     } else if (method->is_continuation_yield_intrinsic()) {
1865       _cont_doYield_stub = nm;
1866     }
1867     return nm;
1868   }
1869 
1870   if (method->is_method_handle_intrinsic()) {
1871     vmIntrinsics::ID iid = method->intrinsic_id();
1872     intptr_t start = (intptr_t)__ pc();
1873     int vep_offset = ((intptr_t)__ pc()) - start;
1874     gen_special_dispatch(masm,
1875                          method,
1876                          in_sig_bt,
1877                          in_regs);
1878     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1879     __ flush();
1880     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1881     return nmethod::new_native_nmethod(method,
1882                                        compile_id,
1883                                        masm->code(),
1884                                        vep_offset,
1885                                        frame_complete,
1886                                        stack_slots / VMRegImpl::slots_per_word,
1887                                        in_ByteSize(-1),
1888                                        in_ByteSize(-1),
1889                                        nullptr);
1890   }
1891   address native_func = method->native_function();
1892   assert(native_func != nullptr, "must have function");
1893 
1894   // An OopMap for lock (and class if static)
1895   OopMapSet *oop_maps = new OopMapSet();
1896   intptr_t start = (intptr_t)__ pc();
1897 
1898   // We have received a description of where all the java args are located
1899   // on entry to the wrapper. We need to convert these args to where
1900   // the jni function will expect them. To figure out where they go
1901   // we convert the java signature to a C signature by inserting
1902   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1903 
1904   const int total_in_args = method->size_of_parameters();
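       // +1 for the hidden JNIEnv* argument; a static method gets one more for the
       // class mirror that stands in for the receiver.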
1905   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1906 
1907   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1908   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1909   BasicType* in_elem_bt = nullptr;
1910 
1911   int argc = 0;
1912   out_sig_bt[argc++] = T_ADDRESS;
1913   if (method->is_static()) {
1914     out_sig_bt[argc++] = T_OBJECT;
1915   }
1916 
1917   for (int i = 0; i < total_in_args ; i++ ) {
1918     out_sig_bt[argc++] = in_sig_bt[i];
1919   }
1920 
1921   // Now figure out where the args must be stored and how much stack space
1922   // they require.
1923   int out_arg_slots;
1924   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1925 
1926   // Compute framesize for the wrapper.  We need to handlize all oops in
1927   // incoming registers
1928 
1929   // Calculate the total number of stack slots we will need.
1930 
1931   // First count the abi requirement plus all of the outgoing args
1932   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1933 
1934   // Now the space for the inbound oop handle area
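       // One word per Java integer argument register (j_rarg0..j_rarg5): incoming oop
       // arguments are copied here so they can be handlized and covered by the oop map.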
1935   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1936 
1937   int oop_handle_offset = stack_slots;
1938   stack_slots += total_save_slots;
1939 
1940   // Now any space we need for handlizing a klass if static method
1941 
1942   int klass_slot_offset = 0;
1943   int klass_offset = -1;
1944   int lock_slot_offset = 0;
1945   bool is_static = false;
1946 
1947   if (method->is_static()) {
1948     klass_slot_offset = stack_slots;
1949     stack_slots += VMRegImpl::slots_per_word;
1950     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1951     is_static = true;
1952   }
1953 
1954   // Plus a lock if needed
1955 
1956   if (method->is_synchronized()) {
1957     lock_slot_offset = stack_slots;
1958     stack_slots += VMRegImpl::slots_per_word;
1959   }
1960 
1961   // Now a place (+2) to save return values or temp during shuffling
1962   // + 4 for return address (which we own) and saved rbp
1963   stack_slots += 6;
1964 
1965   // Ok The space we have allocated will look like:
1966   //
1967   //
1968   // FP-> |                     |
1969   //      |---------------------|
1970   //      | 2 slots for moves   |
1971   //      |---------------------|
1972   //      | lock box (if sync)  |
1973   //      |---------------------| <- lock_slot_offset
1974   //      | klass (if static)   |
1975   //      |---------------------| <- klass_slot_offset
1976   //      | oopHandle area      |
1977   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1978   //      | outbound memory     |
1979   //      | based arguments     |
1980   //      |                     |
1981   //      |---------------------|
1982   //      |                     |
1983   // SP-> | out_preserved_slots |
1984   //
1985   //
1986 
1987 
1988   // Now compute the actual number of stack words we need, rounding to keep the
1989   // stack properly aligned.
1990   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1991 
1992   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1993 
1994   // First thing make an ic check to see if we should even be here
1995 
1996   // We are free to use all registers as temps without saving them and
1997   // restoring them except rbp. rbp is the only callee save register
1998   // as far as the interpreter and the compiler(s) are concerned.
1999 
2000   const Register receiver = j_rarg0;
2001 
2002   Label exception_pending;
2003 
2004   assert_different_registers(receiver, rscratch1, rscratch2);
2005   __ verify_oop(receiver);
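       // Inline cache check: compare the receiver's klass with the expected klass
       // supplied by the caller (in rax); on a mismatch we jump to the IC miss stub.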
2006   __ ic_check(8 /* end_alignment */);
2007 
2008   int vep_offset = ((intptr_t)__ pc()) - start;
2009 
2010   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2011     Label L_skip_barrier;
2012     Register klass = r10;
2013     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2014     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2015 
2016     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2017 
2018     __ bind(L_skip_barrier);
2019   }
2020 
2021 #ifdef COMPILER1
2022   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2023   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2024     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2025   }
2026 #endif // COMPILER1
2027 
2028   // The instruction at the verified entry point must be 5 bytes or longer
2029   // because it can be patched on the fly by make_non_entrant. The stack bang
2030   // instruction fits that requirement.
2031 
2032   // Generate stack overflow check
2033   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2034 
2035   // Generate a new frame for the wrapper.
2036   __ enter();
2037   // -2 because return address is already present and so is saved rbp
2038   __ subptr(rsp, stack_size - 2*wordSize);
2039 
2040   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2041   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2042   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2043 
2044   // Frame is now completed as far as size and linkage.
2045   int frame_complete = ((intptr_t)__ pc()) - start;
2046 
2047 #ifdef ASSERT
2048   __ check_stack_alignment(rsp, "improperly aligned stack");
2049 #endif /* ASSERT */
2050 
2051 
2052   // We use r14 as the oop handle for the receiver/klass
2053   // It is callee save so it survives the call to native
2054 
2055   const Register oop_handle_reg = r14;
2056 
2057   //
2058   // We immediately shuffle the arguments so that any vm call we have to
2059   // make from here on out (sync slow path, jvmti, etc.) we will have
2060   // captured the oops from our caller and have a valid oopMap for
2061   // them.
2062 
2063   // -----------------
2064   // The Grand Shuffle
2065 
2066   // The Java calling convention is either equal (linux) or denser (win64) than the
2067   // c calling convention. However, because of the jni_env argument the c calling
2068   // convention always has at least one more (and two for static) arguments than Java.
2069   // Therefore if we move the args from java -> c backwards then we will never have
2070   // a register->register conflict and we don't have to build a dependency graph
2071   // and figure out how to break any cycles.
2072   //
2073 
2074   // Record esp-based slot for receiver on stack for non-static methods
2075   int receiver_offset = -1;
2076 
2077   // This is a trick. We double the stack slots so we can claim
2078   // the oops in the caller's frame. Since we are sure to have
2079   // more args than the caller doubling is enough to make
2080   // sure we can capture all the incoming oop args from the
2081   // caller.
2082   //
2083   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2084 
2085   // Mark location of rbp (someday)
2086   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2087 
2088   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2089   // All inbound args are referenced based on rbp and all outbound args via rsp.
2090 
2091 
2092 #ifdef ASSERT
2093   bool reg_destroyed[Register::number_of_registers];
2094   bool freg_destroyed[XMMRegister::number_of_registers];
2095   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2096     reg_destroyed[r] = false;
2097   }
2098   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2099     freg_destroyed[f] = false;
2100   }
2101 
2102 #endif /* ASSERT */
2103 
2104   // For JNI natives the incoming and outgoing registers are offset upwards.
2105   GrowableArray<int> arg_order(2 * total_in_args);
2106 
2107   VMRegPair tmp_vmreg;
2108   tmp_vmreg.set2(rbx->as_VMReg());
2109 
2110   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2111     arg_order.push(i);
2112     arg_order.push(c_arg);
2113   }
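       // arg_order now holds (java_index, c_index) pairs starting with the last argument,
       // so the shuffle below runs back to front and never reads an incoming register
       // after it has been overwritten.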
2114 
2115   int temploc = -1;
2116   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2117     int i = arg_order.at(ai);
2118     int c_arg = arg_order.at(ai + 1);
2119     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2120 #ifdef ASSERT
2121     if (in_regs[i].first()->is_Register()) {
2122       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2123     } else if (in_regs[i].first()->is_XMMRegister()) {
2124       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2125     }
2126     if (out_regs[c_arg].first()->is_Register()) {
2127       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2128     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2129       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2130     }
2131 #endif /* ASSERT */
2132     switch (in_sig_bt[i]) {
2133       case T_ARRAY:
2134       case T_OBJECT:
2135         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2136                     ((i == 0) && (!is_static)),
2137                     &receiver_offset);
2138         break;
2139       case T_VOID:
2140         break;
2141 
2142       case T_FLOAT:
2143         __ float_move(in_regs[i], out_regs[c_arg]);
2144         break;
2145 
2146       case T_DOUBLE:
2147         assert( i + 1 < total_in_args &&
2148                 in_sig_bt[i + 1] == T_VOID &&
2149                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2150         __ double_move(in_regs[i], out_regs[c_arg]);
2151         break;
2152 
2153       case T_LONG :
2154         __ long_move(in_regs[i], out_regs[c_arg]);
2155         break;
2156 
2157       case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); // fall through in product builds
2158 
2159       default:
2160         __ move32_64(in_regs[i], out_regs[c_arg]);
2161     }
2162   }
2163 
2164   int c_arg;
2165 
2166   // Pre-load a static method's oop into r14.  Used both by locking code and
2167   // the normal JNI call code.
2168   // point c_arg at the first arg that is already loaded in case we
2169   // need to spill before we call out
2170   c_arg = total_c_args - total_in_args;
2171 
2172   if (method->is_static()) {
2173 
2174     //  load oop into a register
2175     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2176 
2177     // Now handlize the static class mirror; it's known to be not-null.
2178     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2179     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2180 
2181     // Now get the handle
2182     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2183     // store the klass handle as second argument
2184     __ movptr(c_rarg1, oop_handle_reg);
2185     // and protect the arg if we must spill
2186     c_arg--;
2187   }
2188 
2189   // Change state to native (we save the return address in the thread, since it might not
2190   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2191   // points into the right code segment. It does not have to be the correct return pc.
2192   // We use the same pc/oopMap repeatedly when we call out
2193 
2194   intptr_t the_pc = (intptr_t) __ pc();
2195   oop_maps->add_gc_map(the_pc - start, map);
2196 
2197   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2198 
2199 
2200   // We have all of the arguments set up at this point. We must not touch any of the
2201   // argument registers at this point (if we had to save/restore them, there would be no oop map covering them).
2202 
2203   {
2204     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2205     // protect the args we've loaded
2206     save_args(masm, total_c_args, c_arg, out_regs);
2207     __ mov_metadata(c_rarg1, method());
2208     __ call_VM_leaf(
2209       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2210       r15_thread, c_rarg1);
2211     restore_args(masm, total_c_args, c_arg, out_regs);
2212   }
2213 
2214   // RedefineClasses() tracing support for obsolete method entry
2215   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2216     // protect the args we've loaded
2217     save_args(masm, total_c_args, c_arg, out_regs);
2218     __ mov_metadata(c_rarg1, method());
2219     __ call_VM_leaf(
2220       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2221       r15_thread, c_rarg1);
2222     restore_args(masm, total_c_args, c_arg, out_regs);
2223   }
2224 
2225   // Lock a synchronized method
2226 
2227   // Register definitions used by locking and unlocking
2228 
2229   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2230   const Register obj_reg  = rbx;  // Will contain the oop
2231   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2232   const Register old_hdr  = r13;  // value of old header at unlock time
2233 
2234   Label slow_path_lock;
2235   Label lock_done;
2236 
2237   if (method->is_synchronized()) {
2238     Label count_mon;
2239 
2240     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2241 
2242     // Get the handle (the 2nd argument)
2243     __ mov(oop_handle_reg, c_rarg1);
2244 
2245     // Get address of the box
2246 
2247     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2248 
2249     // Load the oop from the handle
2250     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2251 
2252     if (LockingMode == LM_MONITOR) {
2253       __ jmp(slow_path_lock);
2254     } else if (LockingMode == LM_LEGACY) {
2255       // Load immediate 1 into swap_reg %rax
2256       __ movl(swap_reg, 1);
2257 
2258       // Load (object->mark() | 1) into swap_reg %rax
2259       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2260 
2261       // Save (object->mark() | 1) into BasicLock's displaced header
2262       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2263 
2264       // src -> dest iff dest == rax else rax <- dest
2265       __ lock();
2266       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2267       __ jcc(Assembler::equal, count_mon);
2268 
2269       // Hmm should this move to the slow path code area???
2270 
2271       // Test if the oopMark is an obvious stack pointer, i.e.,
2272       //  1) (mark & 3) == 0, and
2273       //  2) rsp <= mark < rsp + os::pagesize()
2274       // These 3 tests can be done by evaluating the following
2275       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2276       // assuming both stack pointer and pagesize have their
2277       // least significant 2 bits clear.
2278       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2279 
2280       __ subptr(swap_reg, rsp);
2281       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
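           // Example: with a 4K page the mask is ...f003, which keeps the two tag bits and
           // every bit at or above the page size, so the result is zero exactly when the
           // mark has clear tag bits and lies within [rsp, rsp + page_size), i.e. it points
           // into our own stack frame (the recursive-lock case).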
2282 
2283       // Save the test result, for recursive case, the result is zero
2284       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2285       __ jcc(Assembler::notEqual, slow_path_lock);
2286     } else {
2287       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2288       __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2289     }
2290     __ bind(count_mon);
2291     __ inc_held_monitor_count();
2292 
2293     // Slow path will re-enter here
2294     __ bind(lock_done);
2295   }
2296 
2297   // Finally just about ready to make the JNI call
2298 
2299   // get JNIEnv* which is first argument to native
2300   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2301 
2302   // Now set thread in native
2303   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2304 
2305   __ call(RuntimeAddress(native_func));
2306 
2307   // Verify or restore cpu control state after JNI call
2308   __ restore_cpu_control_state_after_jni(rscratch1);
2309 
2310   // Unpack native results.
2311   switch (ret_type) {
2312   case T_BOOLEAN: __ c2bool(rax);            break;
2313   case T_CHAR   : __ movzwl(rax, rax);      break;
2314   case T_BYTE   : __ sign_extend_byte (rax); break;
2315   case T_SHORT  : __ sign_extend_short(rax); break;
2316   case T_INT    : /* nothing to do */        break;
2317   case T_DOUBLE :
2318   case T_FLOAT  :
2319     // Result is in xmm0 we'll save as needed
2320     break;
2321   case T_ARRAY:                 // Really a handle
2322   case T_OBJECT:                // Really a handle
2323       break; // can't de-handlize until after safepoint check
2324   case T_VOID: break;
2325   case T_LONG: break;
2326   default       : ShouldNotReachHere();
2327   }
2328 
2329   Label after_transition;
2330 
2331   // Switch thread to "native transition" state before reading the synchronization state.
2332   // This additional state is necessary because reading and testing the synchronization
2333   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2334   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2335   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2336   //     Thread A is resumed to finish this native method, but doesn't block here since it
2337   //     didn't see any synchronization in progress, and escapes.
2338   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2339 
2340   // Force this write out before the read below
2341   if (!UseSystemMemoryBarrier) {
2342     __ membar(Assembler::Membar_mask_bits(
2343               Assembler::LoadLoad | Assembler::LoadStore |
2344               Assembler::StoreLoad | Assembler::StoreStore));
2345   }
2346 
2347   // check for safepoint operation in progress and/or pending suspend requests
2348   {
2349     Label Continue;
2350     Label slow_path;
2351 
2352     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2353 
2354     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2355     __ jcc(Assembler::equal, Continue);
2356     __ bind(slow_path);
2357 
2358     // Don't use call_VM as it will see a possible pending exception and forward it
2359     // and never return here, preventing us from clearing _last_native_pc down below.
2360     // Nor can we use call_VM_leaf, as it will check to see if rsi & rdi are
2361     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2362     // by hand.
2363     //
2364     __ vzeroupper();
2365     save_native_result(masm, ret_type, stack_slots);
2366     __ mov(c_rarg0, r15_thread);
2367     __ mov(r12, rsp); // remember sp
2368     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2369     __ andptr(rsp, -16); // align stack as required by ABI
2370     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2371     __ mov(rsp, r12); // restore sp
2372     __ reinit_heapbase();
2373     // Restore any method result value
2374     restore_native_result(masm, ret_type, stack_slots);
2375     __ bind(Continue);
2376   }
2377 
2378   // change thread state
2379   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2380   __ bind(after_transition);
2381 
2382   Label reguard;
2383   Label reguard_done;
2384   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2385   __ jcc(Assembler::equal, reguard);
2386   __ bind(reguard_done);
2387 
2388   // native result if any is live
2389 
2390   // Unlock
2391   Label slow_path_unlock;
2392   Label unlock_done;
2393   if (method->is_synchronized()) {
2394 
2395     Label fast_done;
2396 
2397     // Get locked oop from the handle we passed to jni
2398     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2399 
2400     if (LockingMode == LM_LEGACY) {
2401       Label not_recur;
2402       // Simple recursive lock?
2403       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2404       __ jcc(Assembler::notEqual, not_recur);
2405       __ dec_held_monitor_count();
2406       __ jmpb(fast_done);
2407       __ bind(not_recur);
2408     }
2409 
2410     // Must save rax if it is live now because cmpxchg must use it
2411     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2412       save_native_result(masm, ret_type, stack_slots);
2413     }
2414 
2415     if (LockingMode == LM_MONITOR) {
2416       __ jmp(slow_path_unlock);
2417     } else if (LockingMode == LM_LEGACY) {
2418       // get address of the stack lock
2419       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2420       //  get old displaced header
2421       __ movptr(old_hdr, Address(rax, 0));
2422 
2423       // Atomic swap old header if oop still contains the stack lock
2424       __ lock();
2425       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2426       __ jcc(Assembler::notEqual, slow_path_unlock);
2427       __ dec_held_monitor_count();
2428     } else {
2429       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2430       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2431       __ dec_held_monitor_count();
2432     }
2433 
2434     // slow path re-enters here
2435     __ bind(unlock_done);
2436     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2437       restore_native_result(masm, ret_type, stack_slots);
2438     }
2439 
2440     __ bind(fast_done);
2441   }
2442   {
2443     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2444     save_native_result(masm, ret_type, stack_slots);
2445     __ mov_metadata(c_rarg1, method());
2446     __ call_VM_leaf(
2447          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2448          r15_thread, c_rarg1);
2449     restore_native_result(masm, ret_type, stack_slots);
2450   }
2451 
2452   __ reset_last_Java_frame(false);
2453 
2454   // Unbox oop result, e.g. JNIHandles::resolve value.
2455   if (is_reference_type(ret_type)) {
2456     __ resolve_jobject(rax /* value */,
2457                        r15_thread /* thread */,
2458                        rcx /* tmp */);
2459   }
2460 
2461   if (CheckJNICalls) {
2462     // clear_pending_jni_exception_check
2463     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2464   }
2465 
2466   // reset handle block
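       // Writing 0 to the block's top() logically frees all JNI local references
       // created during the native call.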
2467   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2468   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2469 
2470   // pop our frame
2471 
2472   __ leave();
2473 
2474   // Any exception pending?
2475   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2476   __ jcc(Assembler::notEqual, exception_pending);
2477 
2478   // Return
2479 
2480   __ ret(0);
2481 
2482   // Unexpected paths are out of line and go here
2483 
2484   // forward the exception
2485   __ bind(exception_pending);
2486 
2487   // and forward the exception
2488   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2489 
2490   // Slow path locking & unlocking
2491   if (method->is_synchronized()) {
2492 
2493     // BEGIN Slow path lock
2494     __ bind(slow_path_lock);
2495 
2496     // We have last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM.
2497     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2498 
2499     // protect the args we've loaded
2500     save_args(masm, total_c_args, c_arg, out_regs);
2501 
2502     __ mov(c_rarg0, obj_reg);
2503     __ mov(c_rarg1, lock_reg);
2504     __ mov(c_rarg2, r15_thread);
2505 
2506     // Not a leaf but we have last_Java_frame setup as we want
2507     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2508     restore_args(masm, total_c_args, c_arg, out_regs);
2509 
2510 #ifdef ASSERT
2511     { Label L;
2512     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2513     __ jcc(Assembler::equal, L);
2514     __ stop("no pending exception allowed on exit from monitorenter");
2515     __ bind(L);
2516     }
2517 #endif
2518     __ jmp(lock_done);
2519 
2520     // END Slow path lock
2521 
2522     // BEGIN Slow path unlock
2523     __ bind(slow_path_unlock);
2524 
2525     // If we haven't already saved the native result we must save it now as xmm registers
2526     // are still exposed.
2527     __ vzeroupper();
2528     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2529       save_native_result(masm, ret_type, stack_slots);
2530     }
2531 
2532     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2533 
2534     __ mov(c_rarg0, obj_reg);
2535     __ mov(c_rarg2, r15_thread);
2536     __ mov(r12, rsp); // remember sp
2537     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2538     __ andptr(rsp, -16); // align stack as required by ABI
2539 
2540     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2541     // NOTE that obj_reg == rbx currently
2542     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2543     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2544 
2545     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2546     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2547     __ mov(rsp, r12); // restore sp
2548     __ reinit_heapbase();
2549 #ifdef ASSERT
2550     {
2551       Label L;
2552       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2553       __ jcc(Assembler::equal, L);
2554       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2555       __ bind(L);
2556     }
2557 #endif /* ASSERT */
2558 
2559     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2560 
2561     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2562       restore_native_result(masm, ret_type, stack_slots);
2563     }
2564     __ jmp(unlock_done);
2565 
2566     // END Slow path unlock
2567 
2568   } // synchronized
2569 
2570   // SLOW PATH Reguard the stack if needed
2571 
2572   __ bind(reguard);
2573   __ vzeroupper();
2574   save_native_result(masm, ret_type, stack_slots);
2575   __ mov(r12, rsp); // remember sp
2576   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2577   __ andptr(rsp, -16); // align stack as required by ABI
2578   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2579   __ mov(rsp, r12); // restore sp
2580   __ reinit_heapbase();
2581   restore_native_result(masm, ret_type, stack_slots);
2582   // and continue
2583   __ jmp(reguard_done);
2584 
2585 
2586 
2587   __ flush();
2588 
2589   nmethod *nm = nmethod::new_native_nmethod(method,
2590                                             compile_id,
2591                                             masm->code(),
2592                                             vep_offset,
2593                                             frame_complete,
2594                                             stack_slots / VMRegImpl::slots_per_word,
2595                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2596                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2597                                             oop_maps);
2598 
2599   return nm;
2600 }
2601 
2602 // This function returns the adjustment size (in number of words) to a c2i adapter
2603 // activation for use during deoptimization.
2604 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2605   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2606 }
2607 
2608 
2609 uint SharedRuntime::out_preserve_stack_slots() {
2610   return 0;
2611 }
2612 
2613 
2614 // Number of stack slots between incoming argument block and the start of
2615 // a new frame.  The PROLOG must add this many slots to the stack.  The
2616 // EPILOG must remove this many slots.  amd64 needs two slots for the
2617 // return address and two for the saved rbp.
2618 uint SharedRuntime::in_preserve_stack_slots() {
2619   return 4 + 2 * VerifyStackAtCalls;
2620 }
2621 
2622 //------------------------------generate_deopt_blob----------------------------
2623 void SharedRuntime::generate_deopt_blob() {
2624   // Allocate space for the code
2625   ResourceMark rm;
2626   // Setup code generation tools
2627   int pad = 0;
2628   if (UseAVX > 2) {
2629     pad += 1024;
2630   }
2631   if (UseAPX) {
2632     pad += 1024;
2633   }
2634 #if INCLUDE_JVMCI
2635   if (EnableJVMCI) {
2636     pad += 512; // Increase the buffer size when compiling for JVMCI
2637   }
2638 #endif
2639   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2640   MacroAssembler* masm = new MacroAssembler(&buffer);
2641   int frame_size_in_words;
2642   OopMap* map = nullptr;
2643   OopMapSet *oop_maps = new OopMapSet();
2644 
2645   // -------------
2646   // This code enters when returning to a de-optimized nmethod.  A return
2647   // address has been pushed on the stack, and return values are in
2648   // registers.
2649   // If we are doing a normal deopt then we were called from the patched
2650   // nmethod at the point where we returned into it. So the return
2651   // address on the stack is wrong by NativeCall::instruction_size
2652   // We will adjust the value so it looks like we have the original return
2653   // address on the stack (like when we eagerly deoptimized).
2654   // In the case of an exception pending when deoptimizing, we enter
2655   // with a return address on the stack that points after the call we patched
2656   // into the exception handler. We have the following register state from,
2657   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2658   //    rax: exception oop
2659   //    rbx: exception handler
2660   //    rdx: throwing pc
2661   // So in this case we simply jam rdx into the useless return address and
2662   // the stack looks just like we want.
2663   //
2664   // At this point we need to de-opt.  We save the argument return
2665   // registers.  We call the first C routine, fetch_unroll_info().  This
2666   // routine captures the return values and returns a structure which
2667   // describes the current frame size and the sizes of all replacement frames.
2668   // The current frame is compiled code and may contain many inlined
2669   // functions, each with their own JVM state.  We pop the current frame, then
2670   // push all the new frames.  Then we call the C routine unpack_frames() to
2671   // populate these frames.  Finally unpack_frames() returns us the new target
2672   // address.  Notice that callee-save registers are BLOWN here; they have
2673   // already been captured in the vframeArray at the time the return PC was
2674   // patched.
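     //
     // The blob therefore has several entry points, each recorded as an offset
     // from 'start' below: the normal (Unpack_deopt) entry, the reexecute entry,
     // the JVMCI implicit-exception and uncommon-trap entries (when EnableJVMCI),
     // and the exception / exception-in-TLS entries.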
2675   address start = __ pc();
2676   Label cont;
2677 
2678   // Prolog for the non-exception case!
2679 
2680   // Save everything in sight.
2681   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2682 
2683   // Normal deoptimization.  Save exec mode for unpack_frames.
2684   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2685   __ jmp(cont);
2686 
2687   int reexecute_offset = __ pc() - start;
2688 #if INCLUDE_JVMCI && !defined(COMPILER1)
2689   if (EnableJVMCI && UseJVMCICompiler) {
2690     // JVMCI does not use this kind of deoptimization
2691     __ should_not_reach_here();
2692   }
2693 #endif
2694 
2695   // Reexecute case
2696   // The return address is the pc that describes which bci to re-execute at.
2697 
2698   // No need to update map as each call to save_live_registers will produce an identical oopmap
2699   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2700 
2701   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2702   __ jmp(cont);
2703 
2704 #if INCLUDE_JVMCI
2705   Label after_fetch_unroll_info_call;
2706   int implicit_exception_uncommon_trap_offset = 0;
2707   int uncommon_trap_offset = 0;
2708 
2709   if (EnableJVMCI) {
2710     implicit_exception_uncommon_trap_offset = __ pc() - start;
2711 
2712     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2713     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2714 
2715     uncommon_trap_offset = __ pc() - start;
2716 
2717     // Save everything in sight.
2718     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2719     // fetch_unroll_info needs to call last_java_frame()
2720     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2721 
2722     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2723     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2724 
2725     __ movl(r14, Deoptimization::Unpack_reexecute);
2726     __ mov(c_rarg0, r15_thread);
2727     __ movl(c_rarg2, r14); // exec mode
2728     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2729     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2730 
2731     __ reset_last_Java_frame(false);
2732 
2733     __ jmp(after_fetch_unroll_info_call);
2734   } // EnableJVMCI
2735 #endif // INCLUDE_JVMCI
2736 
2737   int exception_offset = __ pc() - start;
2738 
2739   // Prolog for exception case
2740 
2741   // All registers are dead at this entry point, except for rax and
2742   // rdx, which contain the exception oop and exception pc
2743   // respectively.  Set them in TLS and fall thru to the
2744   // unpack_with_exception_in_tls entry point.
2745 
2746   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2747   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2748 
2749   int exception_in_tls_offset = __ pc() - start;
2750 
2751   // new implementation because exception oop is now passed in JavaThread
2752 
2753   // Prolog for exception case
2754   // All registers must be preserved because they might be used by LinearScan
2755   // Exception oop and throwing PC are passed in JavaThread
2756   // tos: stack at point of call to method that threw the exception (i.e. only
2757   // args are on the stack, no return address)
2758 
2759   // make room on stack for the return address
2760   // It will be patched later with the throwing pc. The correct value is not
2761   // available now because loading it from memory would destroy registers.
2762   __ push(0);
2763 
2764   // Save everything in sight.
2765   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2766 
2767   // Now it is safe to overwrite any register
2768 
2769   // Deopt during an exception.  Save exec mode for unpack_frames.
2770   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2771 
2772   // load throwing pc from JavaThread and patch it as the return address
2773   // of the current frame. Then clear the field in JavaThread
2774 
2775   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2776   __ movptr(Address(rbp, wordSize), rdx);
2777   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2778 
2779 #ifdef ASSERT
2780   // verify that there is really an exception oop in JavaThread
2781   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2782   __ verify_oop(rax);
2783 
2784   // verify that there is no pending exception
2785   Label no_pending_exception;
2786   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2787   __ testptr(rax, rax);
2788   __ jcc(Assembler::zero, no_pending_exception);
2789   __ stop("must not have pending exception here");
2790   __ bind(no_pending_exception);
2791 #endif
2792 
2793   __ bind(cont);
2794 
2795   // Call C code.  Need thread and this frame, but NOT official VM entry
2796   // crud.  We cannot block on this call, no GC can happen.
2797   //
2798   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2799 
2800   // fetch_unroll_info needs to call last_java_frame().
2801 
2802   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2803 #ifdef ASSERT
2804   { Label L;
2805     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2806     __ jcc(Assembler::equal, L);
2807     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2808     __ bind(L);
2809   }
2810 #endif // ASSERT
2811   __ mov(c_rarg0, r15_thread);
2812   __ movl(c_rarg1, r14); // exec_mode
2813   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2814 
2815   // Need to have an oopmap that tells fetch_unroll_info where to
2816   // find any register it might need.
2817   oop_maps->add_gc_map(__ pc() - start, map);
2818 
2819   __ reset_last_Java_frame(false);
2820 
2821 #if INCLUDE_JVMCI
2822   if (EnableJVMCI) {
2823     __ bind(after_fetch_unroll_info_call);
2824   }
2825 #endif
2826 
2827   // Load UnrollBlock* into rdi
2828   __ mov(rdi, rax);
2829 
2830   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2831   Label noException;
2832   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2833   __ jcc(Assembler::notEqual, noException);
2834   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2835   // QQQ this is useless; exception_pc was already cleared to null above
2836   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2837   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2838   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2839 
2840   __ verify_oop(rax);
2841 
2842   // Overwrite the result registers with the exception results.
2843   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2844   // I think this is useless
2845   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2846 
2847   __ bind(noException);
2848 
2849   // Only register save data is on the stack.
2850   // Now restore the result registers.  Everything else is either dead
2851   // or captured in the vframeArray.
2852   RegisterSaver::restore_result_registers(masm);
2853 
2854   // All of the register save area has been popped off the stack. Only the
2855   // return address remains.
2856 
2857   // Pop all the frames we must move/replace.
2858   //
2859   // Frame picture (youngest to oldest)
2860   // 1: self-frame (no frame link)
2861   // 2: deopting frame  (no frame link)
2862   // 3: caller of deopting frame (could be compiled/interpreted).
2863   //
2864   // Note: by leaving the return address of self-frame on the stack
2865   // and using the size of frame 2 to adjust the stack
2866   // when we are done the return to frame 3 will still be on the stack.
2867 
2868   // Pop deoptimized frame
2869   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2870   __ addptr(rsp, rcx);
2871 
2872   // rsp should be pointing at the return address to the caller (3)
2873 
2874   // Pick up the initial fp we should save.
2875   // Restore rbp before the stack bang because if a stack overflow is thrown it needs to be pushed (and preserved).
2876   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2877 
2878 #ifdef ASSERT
2879   // Compilers generate code that bangs the stack by as much as the
2880   // interpreter would need. So this stack banging should never
2881   // trigger a fault. Verify that it does not on non-product builds.
2882   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2883   __ bang_stack_size(rbx, rcx);
2884 #endif
2885 
2886   // Load address of array of frame pcs into rcx
2887   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2888 
2889   // Trash the old pc
2890   __ addptr(rsp, wordSize);
2891 
2892   // Load address of array of frame sizes into rsi
2893   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2894 
2895   // Load counter into rdx
2896   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2897 
2898   // Now adjust the caller's stack to make up for the extra locals
2899   // but record the original sp so that we can save it in the skeletal interpreter
2900   // frame and the stack walking of interpreter_sender will get the unextended sp
2901   // value and not the "real" sp value.
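     // (Roughly, caller_adjustment is the byte-sized counterpart of
     // last_frame_adjust() above: it makes room for interpreter locals of the
     // oldest replacement frame that extend beyond the incoming argument area.)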
2902 
2903   const Register sender_sp = r8;
2904 
2905   __ mov(sender_sp, rsp);
2906   __ movl(rbx, Address(rdi,
2907                        Deoptimization::UnrollBlock::
2908                        caller_adjustment_offset()));
2909   __ subptr(rsp, rbx);
2910 
2911   // Push interpreter frames in a loop
2912   Label loop;
2913   __ bind(loop);
2914   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2915   __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
2916   __ pushptr(Address(rcx, 0));          // Save return address
2917   __ enter();                           // Save old & set new rbp
2918   __ subptr(rsp, rbx);                  // Prolog
2919   // This value is corrected by layout_activation_impl
2920   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2921   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2922   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2923   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2924   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2925   __ decrementl(rdx);                   // Decrement counter
2926   __ jcc(Assembler::notZero, loop);
2927   __ pushptr(Address(rcx, 0));          // Save final return address
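     // The stack now holds one skeletal frame per entry in the UnrollBlock's
     // frame_sizes/frame_pcs arrays; the pc pushed last is where the final
     // __ ret(0) below resumes execution once unpack_frames() has filled the
     // frames in.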
2928 
2929   // Re-push self-frame
2930   __ enter();                           // Save old & set new ebp
2931 
2932   // Allocate a full sized register save area.
2933   // Return address and rbp are in place, so we allocate two fewer words.
2934   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2935 
2936   // Restore frame locals after moving the frame
2937   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2938   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2939 
2940   // Call C code.  Need thread but NOT official VM entry
2941   // crud.  We cannot block on this call, no GC can happen.  Call should
2942   // restore return values to their stack-slots with the new SP.
2943   //
2944   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2945 
2946   // Use rbp because the frames look interpreted now
2947   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2948   // Don't need the precise return PC here, just precise enough to point into this code blob.
2949   address the_pc = __ pc();
2950   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2951 
2952   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2953   __ mov(c_rarg0, r15_thread);
2954   __ movl(c_rarg1, r14); // second arg: exec_mode
2955   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2956   // Revert SP alignment after call since we're going to do some SP relative addressing below
2957   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2958 
2959   // Set an oopmap for the call site
2960   // Use the same PC we used for the last java frame
2961   oop_maps->add_gc_map(the_pc - start,
2962                        new OopMap( frame_size_in_words, 0 ));
2963 
2964   // Clear fp AND pc
2965   __ reset_last_Java_frame(true);
2966 
2967   // Collect return values
2968   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2969   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2970   // I think this is useless (throwing pc?)
2971   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2972 
2973   // Pop self-frame.
2974   __ leave();                           // Epilog
2975 
2976   // Jump to interpreter
2977   __ ret(0);
2978 
2979   // Make sure all code is generated
2980   masm->flush();
2981 
2982   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2983   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2984 #if INCLUDE_JVMCI
2985   if (EnableJVMCI) {
2986     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2987     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2988   }
2989 #endif
2990 }
2991 
2992 #ifdef COMPILER2
2993 //------------------------------generate_uncommon_trap_blob--------------------
2994 void SharedRuntime::generate_uncommon_trap_blob() {
2995   // Allocate space for the code
2996   ResourceMark rm;
2997   // Setup code generation tools
2998   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2999   MacroAssembler* masm = new MacroAssembler(&buffer);
3000 
3001   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3002 
3003   address start = __ pc();
3004 
3005   // Push self-frame.  We get here with a return address on the
3006   // stack, so rsp is 8-byte aligned until we allocate our frame.
3007   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
3008 
3009   // No callee saved registers. rbp is assumed implicitly saved
3010   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3011 
3012   // The compiler left unloaded_class_index in j_rarg0; move it to where the
3013   // runtime expects it.
3014   __ movl(c_rarg1, j_rarg0);
3015 
3016   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3017 
3018   // Call C code.  Need thread but NOT official VM entry
3019   // crud.  We cannot block on this call, no GC can happen.  Call should
3020   // capture callee-saved registers as well as return values.
3021   // The thread argument is passed explicitly in c_rarg0 below.
3022   //
3023   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
3024 
3025   __ mov(c_rarg0, r15_thread);
3026   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
3027   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3028 
3029   // Set an oopmap for the call site
3030   OopMapSet* oop_maps = new OopMapSet();
3031   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
3032 
3033   // location of rbp is known implicitly by the frame sender code
3034 
3035   oop_maps->add_gc_map(__ pc() - start, map);
3036 
3037   __ reset_last_Java_frame(false);
3038 
3039   // Load UnrollBlock* into rdi
3040   __ mov(rdi, rax);
3041 
3042 #ifdef ASSERT
3043   { Label L;
3044     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
3045               Deoptimization::Unpack_uncommon_trap);
3046     __ jcc(Assembler::equal, L);
3047     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
3048     __ bind(L);
3049   }
3050 #endif
3051 
3052   // Pop all the frames we must move/replace.
3053   //
3054   // Frame picture (youngest to oldest)
3055   // 1: self-frame (no frame link)
3056   // 2: deopting frame  (no frame link)
3057   // 3: caller of deopting frame (could be compiled/interpreted).
3058 
3059   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3060   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3061 
3062   // Pop deoptimized frame (int)
3063   __ movl(rcx, Address(rdi,
3064                        Deoptimization::UnrollBlock::
3065                        size_of_deoptimized_frame_offset()));
3066   __ addptr(rsp, rcx);
3067 
3068   // rsp should be pointing at the return address to the caller (3)
3069 
3070   // Pick up the initial fp we should save.
3071   // Restore rbp before the stack bang because if a stack overflow is thrown it needs to be pushed (and preserved).
3072   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3073 
3074 #ifdef ASSERT
3075   // Compilers generate code that bangs the stack by as much as the
3076   // interpreter would need. So this stack banging should never
3077   // trigger a fault. Verify that it does not on non-product builds.
3078   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3079   __ bang_stack_size(rbx, rcx);
3080 #endif
3081 
3082   // Load address of array of frame pcs into rcx (address*)
3083   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3084 
3085   // Trash the return pc
3086   __ addptr(rsp, wordSize);
3087 
3088   // Load address of array of frame sizes into rsi (intptr_t*)
3089   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3090 
3091   // Counter
3092   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
3093 
3094   // Now adjust the caller's stack to make up for the extra locals but
3095   // record the original sp so that we can save it in the skeletal
3096   // interpreter frame and the stack walking of interpreter_sender
3097   // will get the unextended sp value and not the "real" sp value.
3098 
3099   const Register sender_sp = r8;
3100 
3101   __ mov(sender_sp, rsp);
3102   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
3103   __ subptr(rsp, rbx);
3104 
3105   // Push interpreter frames in a loop
3106   Label loop;
3107   __ bind(loop);
3108   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3109   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3110   __ pushptr(Address(rcx, 0));     // Save return address
3111   __ enter();                      // Save old & set new rbp
3112   __ subptr(rsp, rbx);             // Prolog
3113   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3114             sender_sp);            // Make it walkable
3115   // This value is corrected by layout_activation_impl
3116   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3117   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3118   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3119   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3120   __ decrementl(rdx);              // Decrement counter
3121   __ jcc(Assembler::notZero, loop);
3122   __ pushptr(Address(rcx, 0));     // Save final return address
3123 
3124   // Re-push self-frame
3125   __ enter();                 // Save old & set new rbp
3126   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3127                               // Prolog
3128 
3129   // Use rbp because the frames look interpreted now
3130   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3131   // Don't need the precise return PC here, just precise enough to point into this code blob.
3132   address the_pc = __ pc();
3133   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3134 
3135   // Call C code.  Need thread but NOT official VM entry
3136   // crud.  We cannot block on this call, no GC can happen.  Call should
3137   // restore return values to their stack-slots with the new SP.
3138   // The thread argument is passed explicitly in c_rarg0 below.
3139   //
3140   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3141 
3142   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3143   __ mov(c_rarg0, r15_thread);
3144   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3145   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3146 
3147   // Set an oopmap for the call site
3148   // Use the same PC we used for the last java frame
3149   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3150 
3151   // Clear fp AND pc
3152   __ reset_last_Java_frame(true);
3153 
3154   // Pop self-frame.
3155   __ leave();                 // Epilog
3156 
3157   // Jump to interpreter
3158   __ ret(0);
3159 
3160   // Make sure all code is generated
3161   masm->flush();
3162 
3163   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3164                                                  SimpleRuntimeFrame::framesize >> 1);
3165 }
3166 #endif // COMPILER2
3167 
3168 //------------------------------generate_handler_blob------
3169 //
3170 // Generate a special Compile2Runtime blob that saves all registers,
3171 // and sets up an oopmap.
3172 //
3173 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3174   assert(StubRoutines::forward_exception_entry() != nullptr,
3175          "must be generated before");
3176 
3177   ResourceMark rm;
3178   OopMapSet *oop_maps = new OopMapSet();
3179   OopMap* map;
3180 
3181   // Allocate space for the code.  Setup code generation tools.
3182   CodeBuffer buffer("handler_blob", 2348, 1024);
3183   MacroAssembler* masm = new MacroAssembler(&buffer);
3184 
3185   address start   = __ pc();
3186   address call_pc = nullptr;
3187   int frame_size_in_words;
3188   bool cause_return = (poll_type == POLL_AT_RETURN);
3189   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
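     // poll_type distinguishes a poll taken at a return (cause_return) from one
     // taken inside a loop; only the vector-loop case needs the full wide vector
     // state saved, since compiled loops may keep live values in those registers.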
3190 
3191   // Make room for return address (or push it again)
3192   if (!cause_return) {
3193     __ push(rbx);
3194   }
3195 
3196   // Save registers, fpu state, and flags
3197   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3198 
3199   // The following is basically a call_VM.  However, we need the precise
3200   // address of the call in order to generate an oopmap. Hence, we do all the
3201   // work ourselves.
3202 
3203   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3204 
3205   // The return address must always be correct so that frame constructor never
3206   // sees an invalid pc.
3207 
3208   if (!cause_return) {
3209     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3210     // Additionally, rbx is a callee saved register and we can look at it later to determine
3211     // if someone changed the return address for us!
3212     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3213     __ movptr(Address(rbp, wordSize), rbx);
3214   }
3215 
3216   // Do the call
3217   __ mov(c_rarg0, r15_thread);
3218   __ call(RuntimeAddress(call_ptr));
3219 
3220   // Set an oopmap for the call site.  This oopmap will map all
3221   // oop-registers and debug-info registers as callee-saved.  This
3222   // will allow deoptimization at this safepoint to find all possible
3223   // debug-info recordings, as well as let GC find all oops.
3224 
3225   oop_maps->add_gc_map( __ pc() - start, map);
3226 
3227   Label noException;
3228 
3229   __ reset_last_Java_frame(false);
3230 
3231   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3232   __ jcc(Assembler::equal, noException);
3233 
3234   // Exception pending
3235 
3236   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3237 
3238   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3239 
3240   // No exception case
3241   __ bind(noException);
3242 
3243   Label no_adjust;
3244 #ifdef ASSERT
3245   Label bail;
3246 #endif
3247   if (!cause_return) {
3248     Label no_prefix, not_special;
3249 
3250     // If our stashed return pc was modified by the runtime we avoid touching it
3251     __ cmpptr(rbx, Address(rbp, wordSize));
3252     __ jccb(Assembler::notEqual, no_adjust);
3253 
3254     // Skip over the poll instruction.
3255     // See NativeInstruction::is_safepoint_poll()
3256     // Possible encodings:
3257     //      85 00       test   %eax,(%rax)
3258     //      85 01       test   %eax,(%rcx)
3259     //      85 02       test   %eax,(%rdx)
3260     //      85 03       test   %eax,(%rbx)
3261     //      85 06       test   %eax,(%rsi)
3262     //      85 07       test   %eax,(%rdi)
3263     //
3264     //   41 85 00       test   %eax,(%r8)
3265     //   41 85 01       test   %eax,(%r9)
3266     //   41 85 02       test   %eax,(%r10)
3267     //   41 85 03       test   %eax,(%r11)
3268     //   41 85 06       test   %eax,(%r14)
3269     //   41 85 07       test   %eax,(%r15)
3270     //
3271     //      85 04 24    test   %eax,(%rsp)
3272     //   41 85 04 24    test   %eax,(%r12)
3273     //      85 45 00    test   %eax,0x0(%rbp)
3274     //   41 85 45 00    test   %eax,0x0(%r13)
3275 
3276     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3277     __ jcc(Assembler::notEqual, no_prefix);
3278     __ addptr(rbx, 1);
3279     __ bind(no_prefix);
3280 #ifdef ASSERT
3281     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3282 #endif
3283     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3284     // r12/rsp 0x04
3285     // r13/rbp 0x05
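         // e.g. the "85 04 24" and "41 85 45 00" forms listed above are one byte
         // longer than the plain register-indirect forms, so step over that extra
         // byte as well.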
3286     __ movzbq(rcx, Address(rbx, 1));
3287     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3288     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3289     __ cmpptr(rcx, 1);
3290     __ jcc(Assembler::above, not_special);
3291     __ addptr(rbx, 1);
3292     __ bind(not_special);
3293 #ifdef ASSERT
3294     // Verify the correct encoding of the poll we're about to skip.
3295     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3296     __ jcc(Assembler::notEqual, bail);
3297     // Mask out the modrm bits
3298     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3299     // rax encodes to 0, so if the bits are nonzero it's incorrect
3300     __ jcc(Assembler::notZero, bail);
3301 #endif
3302     // Adjust return pc forward to step over the safepoint poll instruction
3303     __ addptr(rbx, 2);
3304     __ movptr(Address(rbp, wordSize), rbx);
3305   }
3306 
3307   __ bind(no_adjust);
3308   // Normal exit, restore registers and exit.
3309   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3310   __ ret(0);
3311 
3312 #ifdef ASSERT
3313   __ bind(bail);
3314   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3315 #endif
3316 
3317   // Make sure all code is generated
3318   masm->flush();
3319 
3320   // Fill-out other meta info
3321   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3322 }
3323 
3324 //
3325 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3326 //
3327 // Generate a stub that calls into the VM to find out the proper destination
3328 // of a java call. All the argument registers are live at this point, but
3329 // since this is generic code we don't know what they are, and the caller
3330 // must do any GC of the args.
3331 //
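     // (The destinations passed in here are, as wired up elsewhere in the shared
     // runtime, entry points such as SharedRuntime::resolve_static_call_C,
     // resolve_virtual_call_C, resolve_opt_virtual_call_C and
     // handle_wrong_method_ic_miss.)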
3332 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3333   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3334 
3335   // allocate space for the code
3336   ResourceMark rm;
3337 
3338   CodeBuffer buffer(name, 1552, 512);
3339   MacroAssembler* masm = new MacroAssembler(&buffer);
3340 
3341   int frame_size_in_words;
3342 
3343   OopMapSet *oop_maps = new OopMapSet();
3344   OopMap* map = nullptr;
3345 
3346   int start = __ offset();
3347 
3348   // No need to save vector registers since they are caller-saved anyway.
3349   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3350 
3351   int frame_complete = __ offset();
3352 
3353   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3354 
3355   __ mov(c_rarg0, r15_thread);
3356 
3357   __ call(RuntimeAddress(destination));
3358 
3359 
3360   // Set an oopmap for the call site.
3361   // We need this not only for callee-saved registers, but also for volatile
3362   // registers that the compiler might be keeping live across a safepoint.
3363 
3364   oop_maps->add_gc_map( __ offset() - start, map);
3365 
3366   // rax contains the address we are going to jump to assuming no exception got installed
3367 
3368   // clear last_Java_sp
3369   __ reset_last_Java_frame(false);
3370   // check for pending exceptions
3371   Label pending;
3372   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3373   __ jcc(Assembler::notEqual, pending);
3374 
3375   // get the returned Method*
3376   __ get_vm_result_2(rbx, r15_thread);
3377   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3378 
3379   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3380 
3381   RegisterSaver::restore_live_registers(masm);
3382 
3383   // We are back to the original state on entry and ready to go.
3384 
3385   __ jmp(rax);
3386 
3387   // Pending exception after the safepoint
3388 
3389   __ bind(pending);
3390 
3391   RegisterSaver::restore_live_registers(masm);
3392 
3393   // exception pending => remove activation and forward to exception handler
3394 
3395   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3396 
3397   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3398   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3399 
3400   // -------------
3401   // make sure all code is generated
3402   masm->flush();
3403 
3404   // return the blob
3405   // frame_size_words or bytes??
3406   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3407 }
3408 
3409 //------------------------------Montgomery multiplication------------------------
3410 //
3411 
3412 #ifndef _WINDOWS
3413 
3414 // Subtract 0:b from carry:a.  Return carry.
3415 static julong
3416 sub(julong a[], julong b[], julong carry, long len) {
3417   long long i = 0, cnt = len;
3418   julong tmp;
3419   asm volatile("clc; "
3420                "0: ; "
3421                "mov (%[b], %[i], 8), %[tmp]; "
3422                "sbb %[tmp], (%[a], %[i], 8); "
3423                "inc %[i]; dec %[cnt]; "
3424                "jne 0b; "
3425                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3426                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3427                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3428                : "memory");
3429   return tmp;
3430 }
3431 
3432 // Multiply (unsigned) Long A by Long B, accumulating the double-
3433 // length result into the accumulator formed of T0, T1, and T2.
3434 #define MACC(A, B, T0, T1, T2)                                  \
3435 do {                                                            \
3436   unsigned long hi, lo;                                         \
3437   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3438            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3439            : "r"(A), "a"(B) : "cc");                            \
3440  } while(0)
3441 
3442 // As above, but add twice the double-length result into the
3443 // accumulator.
3444 #define MACC2(A, B, T0, T1, T2)                                 \
3445 do {                                                            \
3446   unsigned long hi, lo;                                         \
3447   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3448            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3449            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3450            : "r"(A), "a"(B) : "cc");                            \
3451  } while(0)
3452 
3453 #else //_WINDOWS
3454 
3455 static julong
3456 sub(julong a[], julong b[], julong carry, long len) {
3457   long i;
3458   julong tmp;
3459   unsigned char c = 1;
3460   for (i = 0; i < len; i++) {
3461     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3462     a[i] = tmp;
3463   }
3464   c = _addcarry_u64(c, carry, ~0, &tmp);
3465   return tmp;
3466 }
3467 
3468 // Multiply (unsigned) Long A by Long B, accumulating the double-
3469 // length result into the accumulator formed of T0, T1, and T2.
3470 #define MACC(A, B, T0, T1, T2)                          \
3471 do {                                                    \
3472   julong hi, lo;                            \
3473   lo = _umul128(A, B, &hi);                             \
3474   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3475   c = _addcarry_u64(c, hi, T1, &T1);                    \
3476   _addcarry_u64(c, T2, 0, &T2);                         \
3477  } while(0)
3478 
3479 // As above, but add twice the double-length result into the
3480 // accumulator.
3481 #define MACC2(A, B, T0, T1, T2)                         \
3482 do {                                                    \
3483   julong hi, lo;                            \
3484   lo = _umul128(A, B, &hi);                             \
3485   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3486   c = _addcarry_u64(c, hi, T1, &T1);                    \
3487   _addcarry_u64(c, T2, 0, &T2);                         \
3488   c = _addcarry_u64(0, lo, T0, &T0);                    \
3489   c = _addcarry_u64(c, hi, T1, &T1);                    \
3490   _addcarry_u64(c, T2, 0, &T2);                         \
3491  } while(0)
3492 
3493 #endif //_WINDOWS
3494 
3495 // Fast Montgomery multiplication.  The derivation of the algorithm is
3496 // in "A Cryptographic Library for the Motorola DSP56000", Dusse and
3497 // Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
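     //
     // Sketch of the reduction performed below, column by column: for column i
     // the loops accumulate sum(a[j]*b[i-j]) + sum(m[j]*n[i-j]) into the
     // triple-precision accumulator (t0, t1, t2); m[i] = t0 * inv (mod 2^64) is
     // then chosen so that adding m[i]*n[0] clears the low word, which is what
     // the assert(t0 == 0) in the loop verifies before the accumulator is
     // shifted down one word.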
3498 
3499 static void NOINLINE
3500 montgomery_multiply(julong a[], julong b[], julong n[],
3501                     julong m[], julong inv, int len) {
3502   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3503   int i;
3504 
3505   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3506 
3507   for (i = 0; i < len; i++) {
3508     int j;
3509     for (j = 0; j < i; j++) {
3510       MACC(a[j], b[i-j], t0, t1, t2);
3511       MACC(m[j], n[i-j], t0, t1, t2);
3512     }
3513     MACC(a[i], b[0], t0, t1, t2);
3514     m[i] = t0 * inv;
3515     MACC(m[i], n[0], t0, t1, t2);
3516 
3517     assert(t0 == 0, "broken Montgomery multiply");
3518 
3519     t0 = t1; t1 = t2; t2 = 0;
3520   }
3521 
3522   for (i = len; i < 2*len; i++) {
3523     int j;
3524     for (j = i-len+1; j < len; j++) {
3525       MACC(a[j], b[i-j], t0, t1, t2);
3526       MACC(m[j], n[i-j], t0, t1, t2);
3527     }
3528     m[i-len] = t0;
3529     t0 = t1; t1 = t2; t2 = 0;
3530   }
3531 
3532   while (t0)
3533     t0 = sub(m, n, t0, len);
3534 }
3535 
3536 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3537 // multiplies so it should be up to 25% faster than Montgomery
3538 // multiplication.  However, its loop control is more complex and it
3539 // may actually run slower on some machines.
3540 
3541 static void NOINLINE
3542 montgomery_square(julong a[], julong n[],
3543                   julong m[], julong inv, int len) {
3544   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3545   int i;
3546 
3547   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3548 
3549   for (i = 0; i < len; i++) {
3550     int j;
3551     int end = (i+1)/2;
3552     for (j = 0; j < end; j++) {
3553       MACC2(a[j], a[i-j], t0, t1, t2);
3554       MACC(m[j], n[i-j], t0, t1, t2);
3555     }
3556     if ((i & 1) == 0) {
3557       MACC(a[j], a[j], t0, t1, t2);
3558     }
3559     for (; j < i; j++) {
3560       MACC(m[j], n[i-j], t0, t1, t2);
3561     }
3562     m[i] = t0 * inv;
3563     MACC(m[i], n[0], t0, t1, t2);
3564 
3565     assert(t0 == 0, "broken Montgomery square");
3566 
3567     t0 = t1; t1 = t2; t2 = 0;
3568   }
3569 
3570   for (i = len; i < 2*len; i++) {
3571     int start = i-len+1;
3572     int end = start + (len - start)/2;
3573     int j;
3574     for (j = start; j < end; j++) {
3575       MACC2(a[j], a[i-j], t0, t1, t2);
3576       MACC(m[j], n[i-j], t0, t1, t2);
3577     }
3578     if ((i & 1) == 0) {
3579       MACC(a[j], a[j], t0, t1, t2);
3580     }
3581     for (; j < len; j++) {
3582       MACC(m[j], n[i-j], t0, t1, t2);
3583     }
3584     m[i-len] = t0;
3585     t0 = t1; t1 = t2; t2 = 0;
3586   }
3587 
3588   while (t0)
3589     t0 = sub(m, n, t0, len);
3590 }
3591 
3592 // Swap words in a longword.
3593 static julong swap(julong x) {
3594   return (x << 32) | (x >> 32);
3595 }
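     // For example, swap(0x0000000200000001ULL) == 0x0000000100000002ULL: the two
     // 32-bit halves of the 64-bit value exchange places.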
3596 
3597 // Copy len longwords from s to d, word-swapping as we go.  The
3598 // destination array is reversed.
3599 static void reverse_words(julong *s, julong *d, int len) {
3600   d += len;
3601   while(len-- > 0) {
3602     d--;
3603     *d = swap(*s);
3604     s++;
3605   }
3606 }
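     // For example, with len == 2 the result is d[1] = swap(s[0]) and
     // d[0] = swap(s[1]), converting between the jint-array layout handed in by
     // the callers below and the little-endian julong layout the Montgomery
     // routines above operate on.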
3607 
3608 // The threshold at which squaring is advantageous was determined
3609 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3610 #define MONTGOMERY_SQUARING_THRESHOLD 64
3611 
3612 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3613                                         jint len, jlong inv,
3614                                         jint *m_ints) {
3615   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3616   int longwords = len/2;
3617 
3618   // Make very sure we don't use so much space that the stack might
3619   // overflow.  512 jints corresponds to a 16384-bit integer and
3620   // will use a total of 8K bytes of stack space here.
3621   int divisor = sizeof(julong) * 4;
3622   guarantee(longwords <= 8192 / divisor, "must be");
3623   int total_allocation = longwords * sizeof (julong) * 4;
3624   julong *scratch = (julong *)alloca(total_allocation);
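       // Worked check of the bound above: longwords <= 8192 / (sizeof(julong) * 4)
       // = 256, i.e. len <= 512 jints, and the four scratch arrays then occupy at
       // most 4 * 256 * 8 = 8192 bytes.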
3625 
3626   // Local scratch arrays
3627   julong
3628     *a = scratch + 0 * longwords,
3629     *b = scratch + 1 * longwords,
3630     *n = scratch + 2 * longwords,
3631     *m = scratch + 3 * longwords;
3632 
3633   reverse_words((julong *)a_ints, a, longwords);
3634   reverse_words((julong *)b_ints, b, longwords);
3635   reverse_words((julong *)n_ints, n, longwords);
3636 
3637   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3638 
3639   reverse_words(m, (julong *)m_ints, longwords);
3640 }
3641 
3642 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3643                                       jint len, jlong inv,
3644                                       jint *m_ints) {
3645   assert(len % 2 == 0, "array length in montgomery_square must be even");
3646   int longwords = len/2;
3647 
3648   // Make very sure we don't use so much space that the stack might
3649   // overflow.  512 jints corresponds to a 16384-bit integer and
3650   // will use a total of 6K bytes of stack space here.
3651   int divisor = sizeof(julong) * 3;
3652   guarantee(longwords <= (8192 / divisor), "must be");
3653   int total_allocation = longwords * sizeof (julong) * 3;
3654   julong *scratch = (julong *)alloca(total_allocation);
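       // Here the guarantee allows longwords <= 8192 / 24 = 341, so the three
       // scratch arrays use at most 341 * 3 * 8 = 8184 bytes of stack.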
3655 
3656   // Local scratch arrays
3657   julong
3658     *a = scratch + 0 * longwords,
3659     *n = scratch + 1 * longwords,
3660     *m = scratch + 2 * longwords;
3661 
3662   reverse_words((julong *)a_ints, a, longwords);
3663   reverse_words((julong *)n_ints, n, longwords);
3664 
3665   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3666     ::montgomery_square(a, n, m, (julong)inv, longwords);
3667   } else {
3668     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3669   }
3670 
3671   reverse_words(m, (julong *)m_ints, longwords);
3672 }
3673 
3674 #ifdef COMPILER2
3675 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3676 //
3677 //------------------------------generate_exception_blob---------------------------
3678 // Creates the exception blob at the end.
3679 // Compiled methods jump to this code via the exception blob
3680 // (see emit_exception_handler in the x86_64.ad file).
3681 //
3682 // Given an exception pc at a call we call into the runtime for the
3683 // handler in this method. This handler might merely restore state
3684 // (i.e. callee-saved registers), unwind the frame, and jump to the
3685 // exception handler for the nmethod if there is no Java-level handler
3686 // for the nmethod.
3687 //
3688 // This code is entered with a jmp.
3689 //
3690 // Arguments:
3691 //   rax: exception oop
3692 //   rdx: exception pc
3693 //
3694 // Results:
3695 //   rax: exception oop
3696 //   rdx: exception pc in caller or ???
3697 //   destination: exception handler of caller
3698 //
3699 // Note: the exception pc MUST be at a call (precise debug information)
3700 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3701 //
3702 
3703 void OptoRuntime::generate_exception_blob() {
3704   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3705   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3706   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3707 
3708   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3709 
3710   // Allocate space for the code
3711   ResourceMark rm;
3712   // Setup code generation tools
3713   CodeBuffer buffer("exception_blob", 2048, 1024);
3714   MacroAssembler* masm = new MacroAssembler(&buffer);
3715 
3716 
3717   address start = __ pc();
3718 
3719   // Exception pc is 'return address' for stack walker
3720   __ push(rdx);
3721   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3722 
3723   // Save callee-saved registers.  See x86_64.ad.
3724 
3725   // rbp is an implicitly saved callee-saved register (i.e., the calling
3726   // convention will save/restore it in the prolog/epilog). Other than that
3727   // there are no callee-saved registers now that adapter frames are gone.
3728 
3729   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3730 
3731   // Store exception in Thread object. We cannot pass any arguments to the
3732   // handle_exception call, since we do not want to make any assumption
3733   // about the size of the frame where the exception happened in.
3734   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3735   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3736   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3737 
3738   // This call does all the hard work.  It checks if an exception handler
3739   // exists in the method.
3740   // If so, it returns the handler address.
3741   // If not, it prepares for stack-unwinding, restoring the callee-save
3742   // registers of the frame being removed.
3743   //
3744   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3745 
3746   // At a method handle call, the stack may not be properly aligned
3747   // when returning with an exception.
3748   address the_pc = __ pc();
3749   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3750   __ mov(c_rarg0, r15_thread);
3751   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3752   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3753 
3754   // Set an oopmap for the call site.  This oopmap will only be used if we
3755   // are unwinding the stack.  Hence, all locations will be dead.
3756   // Callee-saved registers will be the same as the frame above (i.e.,
3757   // handle_exception_stub), since they were restored when we got the
3758   // exception.
3759 
3760   OopMapSet* oop_maps = new OopMapSet();
3761 
3762   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3763 
3764   __ reset_last_Java_frame(false);
3765 
3766   // Restore callee-saved registers
3767 
3768   // rbp is an implicitly saved callee-saved register (i.e., the calling
3769   // convention will save/restore it in the prolog/epilog). Other than that
3770   // there are no callee-saved registers now that adapter frames are gone.
3771 
3772   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3773 
3774   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3775   __ pop(rdx);                  // No need for exception pc anymore
3776 
3777   // rax: exception handler
3778 
3779   // We have a handler in rax (could be deopt blob).
3780   __ mov(r8, rax);
3781 
3782   // Get the exception oop
3783   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3784   // Get the exception pc in case we are deoptimized
3785   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3786 #ifdef ASSERT
3787   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3788   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3789 #endif
3790   // Clear the exception oop so GC no longer processes it as a root.
3791   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3792 
3793   // rax: exception oop
3794   // r8:  exception handler
3795   // rdx: exception pc
3796   // Jump to handler
3797 
3798   __ jmp(r8);
3799 
3800   // Make sure all code is generated
3801   masm->flush();
3802 
3803   // Set exception blob
3804   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3805 }
3806 #endif // COMPILER2