1 /*
   2  * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/icBuffer.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/compiledICHolder.hpp"
  46 #include "oops/klass.inline.hpp"
  47 #include "oops/method.inline.hpp"
  48 #include "prims/methodHandles.hpp"
  49 #include "runtime/continuation.hpp"
  50 #include "runtime/continuationEntry.inline.hpp"
  51 #include "runtime/globals.hpp"
  52 #include "runtime/jniHandles.hpp"
  53 #include "runtime/safepointMechanism.hpp"
  54 #include "runtime/sharedRuntime.hpp"
  55 #include "runtime/signature.hpp"
  56 #include "runtime/stubRoutines.hpp"
  57 #include "runtime/vframeArray.hpp"
  58 #include "runtime/vm_version.hpp"
  59 #include "utilities/align.hpp"
  60 #include "utilities/formatBuffer.hpp"
  61 #include "vmreg_x86.inline.hpp"
  62 #ifdef COMPILER1
  63 #include "c1/c1_Runtime1.hpp"
  64 #endif
  65 #ifdef COMPILER2
  66 #include "opto/runtime.hpp"
  67 #endif
  68 #if INCLUDE_JVMCI
  69 #include "jvmci/jvmciJavaClasses.hpp"
  70 #endif
  71 
  72 #define __ masm->
  73 
  74 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  75 
  76 class SimpleRuntimeFrame {
  77 
  78   public:
  79 
  80   // Most of the runtime stubs have this simple frame layout.
  81   // This class exists to make the layout shared in one place.
  82   // Offsets are for compiler stack slots, which are jints.
  83   enum layout {
  84     // The frame sender code expects that rbp will be in the "natural" place and
  85     // will override any oopMap setting for it. We must therefore force the layout
  86     // so that it agrees with the frame sender code.
  87     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  88     rbp_off2,
  89     return_off, return_off2,
  90     framesize
  91   };
  92 };
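// A worked illustration of the enum above (not used by the code): assuming
// frame::arg_reg_save_area_bytes == 0, as on non-Windows targets, the values are
//   rbp_off = 0, rbp_off2 = 1, return_off = 2, return_off2 = 3, framesize = 4
// i.e. a two-word frame consisting of the saved rbp and the return address,
// each occupying two 4-byte compiler stack slots.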
  93 
  94 class RegisterSaver {
  95   // Capture info about frame layout.  Layout offsets are in jint
  96   // units because compiler frame slots are jints.
  97 #define XSAVE_AREA_BEGIN 160
  98 #define XSAVE_AREA_YMM_BEGIN 576
  99 #define XSAVE_AREA_OPMASK_BEGIN 1088
 100 #define XSAVE_AREA_ZMM_BEGIN 1152
 101 #define XSAVE_AREA_UPPERBANK 1664
 102 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 103 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 104 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 105 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 106 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
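  // For orientation, DEF_XMM_OFFS(1) in the enum below expands to roughly
  //   xmm1_off = xmm_off + 1*16/BytesPerInt,   // == xmm_off + 4
  //   xmm1H_off                                // next enumerator, == xmm_off + 5
  // so each 16-byte XMM slot covers four 4-byte compiler slots, of which only
  // the first two get named enumerators.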
 107   enum layout {
 108     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 109     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 110     DEF_XMM_OFFS(0),
 111     DEF_XMM_OFFS(1),
 112     // 2..15 are implied in range usage
 113     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 114     DEF_YMM_OFFS(0),
 115     DEF_YMM_OFFS(1),
 116     // 2..15 are implied in range usage
 117     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 118     DEF_OPMASK_OFFS(0),
 119     DEF_OPMASK_OFFS(1),
 120     // 2..7 are implied in range usage
 121     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 122     DEF_ZMM_OFFS(0),
 123     DEF_ZMM_OFFS(1),
 124     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 125     DEF_ZMM_UPPER_OFFS(16),
 126     DEF_ZMM_UPPER_OFFS(17),
 127     // 18..31 are implied in range usage
 128     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 129     fpu_stateH_end,
 130     r15_off, r15H_off,
 131     r14_off, r14H_off,
 132     r13_off, r13H_off,
 133     r12_off, r12H_off,
 134     r11_off, r11H_off,
 135     r10_off, r10H_off,
 136     r9_off,  r9H_off,
 137     r8_off,  r8H_off,
 138     rdi_off, rdiH_off,
 139     rsi_off, rsiH_off,
 140     ignore_off, ignoreH_off,  // extra copy of rbp
 141     rsp_off, rspH_off,
 142     rbx_off, rbxH_off,
 143     rdx_off, rdxH_off,
 144     rcx_off, rcxH_off,
 145     rax_off, raxH_off,
 146     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 147     align_off, alignH_off,
 148     flags_off, flagsH_off,
 149     // The frame sender code expects that rbp will be in the "natural" place and
 150     // will override any oopMap setting for it. We must therefore force the layout
 151     // so that it agrees with the frame sender code.
 152     rbp_off, rbpH_off,        // copy of rbp we will restore
 153     return_off, returnH_off,  // slot for return address
 154     reg_save_size             // size in compiler stack slots
 155   };
 156 
 157  public:
 158   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 159   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 160 
 161   // Offsets into the register save area
 162   // Used by deoptimization when it is managing result register
 163   // values on its own
 164 
 165   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 166   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 167   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 168   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 169   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 170 
 171   // During deoptimization only the result registers need to be restored,
 172   // all the other values have already been extracted.
 173   static void restore_result_registers(MacroAssembler* masm);
 174 };
 175 
 176 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 177   int off = 0;
 178   int num_xmm_regs = XMMRegister::available_xmm_registers();
 179 #if COMPILER2_OR_JVMCI
 180   if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 182   }
 183   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 184 #else
 185   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 186 #endif
 187 
  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated.
 189   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 190   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 191   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 192   // CodeBlob frame size is in words.
 193   int frame_size_in_words = frame_size_in_bytes / wordSize;
 194   *total_frame_words = frame_size_in_words;
 195 
 196   // Save registers, fpu state, and flags.
 197   // We assume caller has already pushed the return address onto the
 198   // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address like a normal enter.
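  // A rough sketch of the frame built below, from higher to lower addresses
  // (the argument register save area exists only when
  // frame::arg_reg_save_area_bytes != 0, e.g. on Windows):
  //   [ return address               ]  pushed by the caller
  //   [ saved rbp                    ]  enter()
  //   [ flags, alignment word,       ]
  //   [ rax .. r15                   ]  push_CPU_state()
  //   [ FXSAVE/XSAVE FPU state       ]
  //   [ argument register save area  ]  optional, allocated below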
 201 
 202   __ enter();          // rsp becomes 16-byte aligned here
 203   __ push_CPU_state(); // Push a multiple of 16 bytes
 204 
  // push_CPU_state handles this on EVEX-enabled targets
 206   if (save_wide_vectors) {
 207     // Save upper half of YMM registers(0..15)
 208     int base_addr = XSAVE_AREA_YMM_BEGIN;
 209     for (int n = 0; n < 16; n++) {
 210       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 211     }
 212     if (VM_Version::supports_evex()) {
 213       // Save upper half of ZMM registers(0..15)
 214       base_addr = XSAVE_AREA_ZMM_BEGIN;
 215       for (int n = 0; n < 16; n++) {
 216         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 217       }
 218       // Save full ZMM registers(16..num_xmm_regs)
 219       base_addr = XSAVE_AREA_UPPERBANK;
 220       off = 0;
 221       int vector_len = Assembler::AVX_512bit;
 222       for (int n = 16; n < num_xmm_regs; n++) {
 223         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 224       }
 225 #if COMPILER2_OR_JVMCI
 226       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 227       off = 0;
 228       for(int n = 0; n < KRegister::number_of_registers; n++) {
 229         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 230       }
 231 #endif
 232     }
 233   } else {
 234     if (VM_Version::supports_evex()) {
 235       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 236       int base_addr = XSAVE_AREA_UPPERBANK;
 237       off = 0;
 238       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 239       for (int n = 16; n < num_xmm_regs; n++) {
 240         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 241       }
 242 #if COMPILER2_OR_JVMCI
 243       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 244       off = 0;
 245       for(int n = 0; n < KRegister::number_of_registers; n++) {
 246         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 247       }
 248 #endif
 249     }
 250   }
 251   __ vzeroupper();
 252   if (frame::arg_reg_save_area_bytes != 0) {
 253     // Allocate argument register save area
 254     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 255   }
 256 
 257   // Set an oopmap for the call site.  This oopmap will map all
 258   // oop-registers and debug-info registers as callee-saved.  This
 259   // will allow deoptimization at this safepoint to find all possible
 260   // debug-info recordings, as well as let GC find all oops.
 261 
 262   OopMapSet *oop_maps = new OopMapSet();
 263   OopMap* map = new OopMap(frame_size_in_slots, 0);
 264 
 265 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 266 
 267   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp location is known implicitly by the frame sender code, needs no oopmap,
  // and the location where rbp was saved is ignored
 273   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX-enabled targets it is included in the xsave area.
 285   off = xmm0_off;
 286   int delta = xmm1_off - off;
 287   for (int n = 0; n < 16; n++) {
 288     XMMRegister xmm_name = as_XMMRegister(n);
 289     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 290     off += delta;
 291   }
 292   if (UseAVX > 2) {
 293     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 294     off = zmm16_off;
 295     delta = zmm17_off - off;
 296     for (int n = 16; n < num_xmm_regs; n++) {
 297       XMMRegister zmm_name = as_XMMRegister(n);
 298       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 299       off += delta;
 300     }
 301   }
 302 
 303 #if COMPILER2_OR_JVMCI
 304   if (save_wide_vectors) {
 305     // Save upper half of YMM registers(0..15)
 306     off = ymm0_off;
 307     delta = ymm1_off - ymm0_off;
 308     for (int n = 0; n < 16; n++) {
 309       XMMRegister ymm_name = as_XMMRegister(n);
 310       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 311       off += delta;
 312     }
 313     if (VM_Version::supports_evex()) {
 314       // Save upper half of ZMM registers(0..15)
 315       off = zmm0_off;
 316       delta = zmm1_off - zmm0_off;
 317       for (int n = 0; n < 16; n++) {
 318         XMMRegister zmm_name = as_XMMRegister(n);
 319         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 320         off += delta;
 321       }
 322     }
 323   }
 324 #endif // COMPILER2_OR_JVMCI
 325 
 326   // %%% These should all be a waste but we'll keep things as they were for now
 327   if (true) {
 328     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 332     // rbp location is known implicitly by the frame sender code, needs no oopmap
 333     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX-enabled targets it is included in the xsave area.
 345     off = xmm0H_off;
 346     delta = xmm1H_off - off;
 347     for (int n = 0; n < 16; n++) {
 348       XMMRegister xmm_name = as_XMMRegister(n);
 349       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 350       off += delta;
 351     }
 352     if (UseAVX > 2) {
 353       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 354       off = zmm16H_off;
 355       delta = zmm17H_off - off;
 356       for (int n = 16; n < num_xmm_regs; n++) {
 357         XMMRegister zmm_name = as_XMMRegister(n);
 358         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 359         off += delta;
 360       }
 361     }
 362   }
 363 
 364   return map;
 365 }
 366 
 367 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 368   int num_xmm_regs = XMMRegister::available_xmm_registers();
 369   if (frame::arg_reg_save_area_bytes != 0) {
 370     // Pop arg register save area
 371     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 372   }
 373 
 374 #if COMPILER2_OR_JVMCI
 375   if (restore_wide_vectors) {
 376     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 377     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 378   }
 379 #else
 380   assert(!restore_wide_vectors, "vectors are generated only by C2");
 381 #endif
 382 
 383   __ vzeroupper();
 384 
  // On EVEX-enabled targets everything is handled by pop_CPU_state
 386   if (restore_wide_vectors) {
 387     // Restore upper half of YMM registers (0..15)
 388     int base_addr = XSAVE_AREA_YMM_BEGIN;
 389     for (int n = 0; n < 16; n++) {
 390       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 391     }
 392     if (VM_Version::supports_evex()) {
 393       // Restore upper half of ZMM registers (0..15)
 394       base_addr = XSAVE_AREA_ZMM_BEGIN;
 395       for (int n = 0; n < 16; n++) {
 396         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 397       }
 398       // Restore full ZMM registers(16..num_xmm_regs)
 399       base_addr = XSAVE_AREA_UPPERBANK;
 400       int vector_len = Assembler::AVX_512bit;
 401       int off = 0;
 402       for (int n = 16; n < num_xmm_regs; n++) {
 403         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 404       }
 405 #if COMPILER2_OR_JVMCI
 406       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 407       off = 0;
 408       for (int n = 0; n < KRegister::number_of_registers; n++) {
 409         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 410       }
 411 #endif
 412     }
 413   } else {
 414     if (VM_Version::supports_evex()) {
 415       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 416       int base_addr = XSAVE_AREA_UPPERBANK;
 417       int off = 0;
 418       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 419       for (int n = 16; n < num_xmm_regs; n++) {
 420         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 421       }
 422 #if COMPILER2_OR_JVMCI
 423       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 424       off = 0;
 425       for (int n = 0; n < KRegister::number_of_registers; n++) {
 426         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 427       }
 428 #endif
 429     }
 430   }
 431 
 432   // Recover CPU state
 433   __ pop_CPU_state();
 434   // Get the rbp described implicitly by the calling convention (no oopMap)
 435   __ pop(rbp);
 436 }
 437 
 438 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 439 
  // Just restore result registers. Only used by deoptimization. By
  // now any callee-save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.
 445 
 446   // Restore fp result register
 447   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 448   // Restore integer result register
 449   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 450   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 451 
  // Pop all of the register save area off the stack except the return address
 453   __ addptr(rsp, return_offset_in_bytes());
 454 }
 455 
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 458 bool SharedRuntime::is_wide_vector(int size) {
 459   return size > 16;
 460 }
 461 
 462 // ---------------------------------------------------------------------------
 463 // Read the array of BasicTypes from a signature, and compute where the
 464 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 465 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 466 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 467 // as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
 472 
// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.

// The Java calling convention is a "shifted" version of the C ABI.
// By skipping the first C ABI register we can call non-static JNI methods
// with small numbers of arguments without having to shuffle the arguments
// at all. Since we control the Java ABI we ought to at least get some
// advantage out of it.
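// A worked example (signature chosen purely for illustration): for the Java
// signature (long, int, Object, double) the loop below assigns
//   long   -> j_rarg0 (set2), its trailing T_VOID half -> BAD
//   int    -> j_rarg1 (set1)
//   Object -> j_rarg2 (set2)
//   double -> j_farg0 (set2), its trailing T_VOID half -> BAD
// and returns 0, since nothing spilled to the stack.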
 482 
 483 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 484                                            VMRegPair *regs,
 485                                            int total_args_passed) {
 486 
 487   // Create the mapping between argument positions and
 488   // registers.
 489   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 490     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 491   };
 492   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 493     j_farg0, j_farg1, j_farg2, j_farg3,
 494     j_farg4, j_farg5, j_farg6, j_farg7
 495   };
 496 
 497 
 498   uint int_args = 0;
 499   uint fp_args = 0;
 500   uint stk_args = 0;
 501 
 502   for (int i = 0; i < total_args_passed; i++) {
 503     switch (sig_bt[i]) {
 504     case T_BOOLEAN:
 505     case T_CHAR:
 506     case T_BYTE:
 507     case T_SHORT:
 508     case T_INT:
 509       if (int_args < Argument::n_int_register_parameters_j) {
 510         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 511       } else {
 512         stk_args = align_up(stk_args, 2);
 513         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 514         stk_args += 1;
 515       }
 516       break;
 517     case T_VOID:
 518       // halves of T_LONG or T_DOUBLE
 519       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 520       regs[i].set_bad();
 521       break;
 522     case T_LONG:
 523       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 524       // fall through
 525     case T_OBJECT:
 526     case T_ARRAY:
 527     case T_ADDRESS:
 528       if (int_args < Argument::n_int_register_parameters_j) {
 529         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 530       } else {
 531         stk_args = align_up(stk_args, 2);
 532         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 533         stk_args += 2;
 534       }
 535       break;
 536     case T_FLOAT:
 537       if (fp_args < Argument::n_float_register_parameters_j) {
 538         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 539       } else {
 540         stk_args = align_up(stk_args, 2);
 541         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 542         stk_args += 1;
 543       }
 544       break;
 545     case T_DOUBLE:
 546       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 547       if (fp_args < Argument::n_float_register_parameters_j) {
 548         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 549       } else {
 550         stk_args = align_up(stk_args, 2);
 551         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 552         stk_args += 2;
 553       }
 554       break;
 555     default:
 556       ShouldNotReachHere();
 557       break;
 558     }
 559   }
 560 
 561   return stk_args;
 562 }
 563 
// Patch the caller's callsite with the entry to compiled code, if it exists.
 565 static void patch_callers_callsite(MacroAssembler *masm) {
 566   Label L;
 567   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 568   __ jcc(Assembler::equal, L);
 569 
 570   // Save the current stack pointer
 571   __ mov(r13, rsp);
 572   // Schedule the branch target address early.
 573   // Call into the VM to patch the caller, then jump to compiled callee
 574   // rax isn't live so capture return address while we easily can
 575   __ movptr(rax, Address(rsp, 0));
 576 
 577   // align stack so push_CPU_state doesn't fault
 578   __ andptr(rsp, -(StackAlignmentInBytes));
 579   __ push_CPU_state();
 580   __ vzeroupper();
 581   // VM needs caller's callsite
 582   // VM needs target method
 583   // This needs to be a long call since we will relocate this adapter to
 584   // the codeBuffer and it may not reach
 585 
 586   // Allocate argument register save area
 587   if (frame::arg_reg_save_area_bytes != 0) {
 588     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 589   }
 590   __ mov(c_rarg0, rbx);
 591   __ mov(c_rarg1, rax);
 592   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 593 
 594   // De-allocate argument register save area
 595   if (frame::arg_reg_save_area_bytes != 0) {
 596     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 597   }
 598 
 599   __ vzeroupper();
 600   __ pop_CPU_state();
 601   // restore sp
 602   __ mov(rsp, r13);
 603   __ bind(L);
 604 }
 605 
 606 
 607 static void gen_c2i_adapter(MacroAssembler *masm,
 608                             int total_args_passed,
 609                             int comp_args_on_stack,
 610                             const BasicType *sig_bt,
 611                             const VMRegPair *regs,
 612                             Label& skip_fixup) {
 613   // Before we get into the guts of the C2I adapter, see if we should be here
 614   // at all.  We've come from compiled code and are attempting to jump to the
 615   // interpreter, which means the caller made a static call to get here
 616   // (vcalls always get a compiled target if there is one).  Check for a
 617   // compiled target.  If there is one, we need to patch the caller's call.
 618   patch_callers_callsite(masm);
 619 
 620   __ bind(skip_fixup);
 621 
 622   // Since all args are passed on the stack, total_args_passed *
 623   // Interpreter::stackElementSize is the space we need.
 624 
 625   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 626 
 627   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 628 
 629   // stack is aligned, keep it that way
 630   // This is not currently needed or enforced by the interpreter, but
 631   // we might as well conform to the ABI.
 632   extraspace = align_up(extraspace, 2*wordSize);
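  // For example (illustrative only): 3 interpreter arguments at
  // Interpreter::stackElementSize == wordSize give extraspace == 24, which is
  // rounded up to 32 here so the 16-byte alignment is preserved.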
 633 
 634   // set senderSP value
 635   __ lea(r13, Address(rsp, wordSize));
 636 
 637 #ifdef ASSERT
 638   __ check_stack_alignment(r13, "sender stack not aligned");
 639 #endif
 640   if (extraspace > 0) {
 641     // Pop the return address
 642     __ pop(rax);
 643 
 644     __ subptr(rsp, extraspace);
 645 
 646     // Push the return address
 647     __ push(rax);
 648 
 649     // Account for the return address location since we store it first rather
 650     // than hold it in a register across all the shuffling
 651     extraspace += wordSize;
 652   }
 653 
 654 #ifdef ASSERT
 655   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 656 #endif
 657 
 658   // Now write the args into the outgoing interpreter space
 659   for (int i = 0; i < total_args_passed; i++) {
 660     if (sig_bt[i] == T_VOID) {
 661       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 662       continue;
 663     }
 664 
 665     // offset to start parameters
 666     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 667     int next_off = st_off - Interpreter::stackElementSize;
 668 
 669     // Say 4 args:
 670     // i   st_off
 671     // 0   32 T_LONG
 672     // 1   24 T_VOID
 673     // 2   16 T_OBJECT
 674     // 3    8 T_BOOL
 675     // -    0 return address
 676     //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM, and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
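    // For the 4-arg example above that means the T_LONG value is written once,
    // at next_off (the T_VOID slot, offset 24); the slot at st_off (32) stays
    // unused and, in debug builds, is filled with junk below.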
 681 
 682     VMReg r_1 = regs[i].first();
 683     VMReg r_2 = regs[i].second();
 684     if (!r_1->is_valid()) {
 685       assert(!r_2->is_valid(), "");
 686       continue;
 687     }
 688     if (r_1->is_stack()) {
 689       // memory to memory use rax
 690       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 691       if (!r_2->is_valid()) {
 692         // sign extend??
 693         __ movl(rax, Address(rsp, ld_off));
 694         __ movptr(Address(rsp, st_off), rax);
 695 
 696       } else {
 697 
 698         __ movq(rax, Address(rsp, ld_off));
 699 
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 701         // T_DOUBLE and T_LONG use two slots in the interpreter
 702         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 703           // ld_off == LSW, ld_off+wordSize == MSW
 704           // st_off == MSW, next_off == LSW
 705           __ movq(Address(rsp, next_off), rax);
 706 #ifdef ASSERT
 707           // Overwrite the unused slot with known junk
 708           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 709           __ movptr(Address(rsp, st_off), rax);
 710 #endif /* ASSERT */
 711         } else {
 712           __ movq(Address(rsp, st_off), rax);
 713         }
 714       }
 715     } else if (r_1->is_Register()) {
 716       Register r = r_1->as_Register();
 717       if (!r_2->is_valid()) {
        // must be only an int (or smaller), so move only 32 bits to the slot
        // why not sign extend??
 720         __ movl(Address(rsp, st_off), r);
 721       } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 723         // T_DOUBLE and T_LONG use two slots in the interpreter
 724         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 725           // long/double in gpr
 726 #ifdef ASSERT
 727           // Overwrite the unused slot with known junk
 728           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 729           __ movptr(Address(rsp, st_off), rax);
 730 #endif /* ASSERT */
 731           __ movq(Address(rsp, next_off), r);
 732         } else {
 733           __ movptr(Address(rsp, st_off), r);
 734         }
 735       }
 736     } else {
 737       assert(r_1->is_XMMRegister(), "");
 738       if (!r_2->is_valid()) {
        // only a float, use just part of the slot
 740         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 741       } else {
 742 #ifdef ASSERT
 743         // Overwrite the unused slot with known junk
 744         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 745         __ movptr(Address(rsp, st_off), rax);
 746 #endif /* ASSERT */
 747         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 748       }
 749     }
 750   }
 751 
 752   // Schedule the branch target address early.
 753   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 754   __ jmp(rcx);
 755 }
 756 
 757 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 758                         address code_start, address code_end,
 759                         Label& L_ok) {
 760   Label L_fail;
 761   __ lea(temp_reg, ExternalAddress(code_start));
 762   __ cmpptr(pc_reg, temp_reg);
 763   __ jcc(Assembler::belowEqual, L_fail);
 764   __ lea(temp_reg, ExternalAddress(code_end));
 765   __ cmpptr(pc_reg, temp_reg);
 766   __ jcc(Assembler::below, L_ok);
 767   __ bind(L_fail);
 768 }
 769 
 770 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 771                                     int total_args_passed,
 772                                     int comp_args_on_stack,
 773                                     const BasicType *sig_bt,
 774                                     const VMRegPair *regs) {
 775 
  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get the args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment we expect in all compiled code and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
 784 
 785   // Adapters can be frameless because they do not require the caller
 786   // to perform additional cleanup work, such as correcting the stack pointer.
 787   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 788   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 789   // even if a callee has modified the stack pointer.
 790   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 791   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 792   // up via the senderSP register).
 793   // In other words, if *either* the caller or callee is interpreted, we can
 794   // get the stack pointer repaired after a call.
 795   // This is why c2i and i2c adapters cannot be indefinitely composed.
 796   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 797   // both caller and callee would be compiled methods, and neither would
 798   // clean up the stack pointer changes performed by the two adapters.
 799   // If this happens, control eventually transfers back to the compiled
 800   // caller, but with an uncorrected stack, causing delayed havoc.
 801 
 802   if (VerifyAdapterCalls &&
 803       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 804     // So, let's test for cascading c2i/i2c adapters right now.
 805     //  assert(Interpreter::contains($return_addr) ||
 806     //         StubRoutines::contains($return_addr),
 807     //         "i2c adapter must return to an interpreter frame");
 808     __ block_comment("verify_i2c { ");
 809     // Pick up the return address
 810     __ movptr(rax, Address(rsp, 0));
 811     Label L_ok;
 812     if (Interpreter::code() != nullptr) {
 813       range_check(masm, rax, r11,
 814                   Interpreter::code()->code_start(),
 815                   Interpreter::code()->code_end(),
 816                   L_ok);
 817     }
 818     if (StubRoutines::initial_stubs_code() != nullptr) {
 819       range_check(masm, rax, r11,
 820                   StubRoutines::initial_stubs_code()->code_begin(),
 821                   StubRoutines::initial_stubs_code()->code_end(),
 822                   L_ok);
 823     }
 824     if (StubRoutines::final_stubs_code() != nullptr) {
 825       range_check(masm, rax, r11,
 826                   StubRoutines::final_stubs_code()->code_begin(),
 827                   StubRoutines::final_stubs_code()->code_end(),
 828                   L_ok);
 829     }
 830     const char* msg = "i2c adapter must return to an interpreter frame";
 831     __ block_comment(msg);
 832     __ stop(msg);
 833     __ bind(L_ok);
    __ block_comment("} verify_i2c ");
 835   }
 836 
 837   // Must preserve original SP for loading incoming arguments because
 838   // we need to align the outgoing SP for compiled code.
 839   __ movptr(r11, rsp);
 840 
 841   // Pick up the return address
 842   __ pop(rax);
 843 
 844   // Convert 4-byte c2 stack slots to words.
 845   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
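  // E.g. (illustrative) comp_args_on_stack == 5: 5 slots * 4 bytes = 20 bytes,
  // aligned up to 24, giving comp_words_on_stack == 3.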
 846 
 847   if (comp_args_on_stack) {
 848     __ subptr(rsp, comp_words_on_stack * wordSize);
 849   }
 850 
 851   // Ensure compiled code always sees stack at proper alignment
 852   __ andptr(rsp, -16);
 853 
  // Push the return address and misalign the stack so that the youngest frame
  // sees it exactly as it would right after a call instruction.
 856   __ push(rax);
 857 
 858   // Put saved SP in another register
 859   const Register saved_sp = rax;
 860   __ movptr(saved_sp, r11);
 861 
 862   // Will jump to the compiled code just as if compiled code was doing it.
 863   // Pre-load the register-jump target early, to schedule it better.
 864   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 865 
 866 #if INCLUDE_JVMCI
 867   if (EnableJVMCI) {
 868     // check if this call should be routed towards a specific entry point
 869     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 870     Label no_alternative_target;
 871     __ jcc(Assembler::equal, no_alternative_target);
 872     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 873     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 874     __ bind(no_alternative_target);
 875   }
 876 #endif // INCLUDE_JVMCI
 877 
 878   // Now generate the shuffle code.  Pick up all register args and move the
 879   // rest through the floating point stack top.
 880   for (int i = 0; i < total_args_passed; i++) {
 881     if (sig_bt[i] == T_VOID) {
 882       // Longs and doubles are passed in native word order, but misaligned
 883       // in the 32-bit build.
 884       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 885       continue;
 886     }
 887 
 888     // Pick up 0, 1 or 2 words from SP+offset.
 889 
 890     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 891             "scrambled load targets?");
 892     // Load in argument order going down.
 893     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 894     // Point to interpreter value (vs. tag)
 895     int next_off = ld_off - Interpreter::stackElementSize;
 896     //
 897     //
 898     //
 899     VMReg r_1 = regs[i].first();
 900     VMReg r_2 = regs[i].second();
 901     if (!r_1->is_valid()) {
 902       assert(!r_2->is_valid(), "");
 903       continue;
 904     }
 905     if (r_1->is_stack()) {
 906       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 907       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 908 
      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going through a c2i because of a miss, a reasonable value of r13
      // will be generated.
 912       if (!r_2->is_valid()) {
 913         // sign extend???
 914         __ movl(r13, Address(saved_sp, ld_off));
 915         __ movptr(Address(rsp, st_off), r13);
 916       } else {
 917         //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed at negative offsets, so the LSW is at the LOW address.
 924 
 925         // ld_off is MSW so get LSW
 926         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 927                            next_off : ld_off;
 928         __ movq(r13, Address(saved_sp, offset));
 929         // st_off is LSW (i.e. reg.first())
 930         __ movq(Address(rsp, st_off), r13);
 931       }
 932     } else if (r_1->is_Register()) {  // Register argument
 933       Register r = r_1->as_Register();
 934       assert(r != rax, "must be different");
 935       if (r_2->is_valid()) {
 936         //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
 940 
 941         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 942                            next_off : ld_off;
 943 
 944         // this can be a misaligned move
 945         __ movq(r, Address(saved_sp, offset));
 946       } else {
 947         // sign extend and use a full word?
 948         __ movl(r, Address(saved_sp, ld_off));
 949       }
 950     } else {
 951       if (!r_2->is_valid()) {
 952         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 953       } else {
 954         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 955       }
 956     }
 957   }
 958 
 959   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 960 
  // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race through here. If that
  // happens we don't want to take a safepoint because the
  // caller frame will look interpreted and arguments are now
  // "compiled" so it is much better to make this transition
  // invisible to the stack walking code. Unfortunately if
  // we try to find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
 970 
 971   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 972 
  // Put Method* where a c2i would expect it, should we end up there.
  // This is only needed because c2 resolve stubs return the Method* as a result
  // in rax.
 976   __ mov(rax, rbx);
 977   __ jmp(r11);
 978 }
 979 
 980 // ---------------------------------------------------------------
 981 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 982                                                             int total_args_passed,
 983                                                             int comp_args_on_stack,
 984                                                             const BasicType *sig_bt,
 985                                                             const VMRegPair *regs,
 986                                                             AdapterFingerPrint* fingerprint) {
 987   address i2c_entry = __ pc();
 988 
 989   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 990 
 991   // -------------------------------------------------------------------------
 992   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 993   // to the interpreter.  The args start out packed in the compiled layout.  They
 994   // need to be unpacked into the interpreter layout.  This will almost always
 995   // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
 997   // On exit from the interpreter, the interpreter will restore our SP (lest the
 998   // compiled code, which relies solely on SP and not RBP, get sick).
 999 
1000   address c2i_unverified_entry = __ pc();
1001   Label skip_fixup;
1002   Label ok;
1003 
1004   Register holder = rax;
1005   Register receiver = j_rarg0;
1006   Register temp = rbx;
1007 
1008   {
1009     __ load_klass(temp, receiver, rscratch1);
1010     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
1011     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
1012     __ jcc(Assembler::equal, ok);
1013     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1014 
1015     __ bind(ok);
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case treat it as a miss so we can get
    // the call site corrected.
1019     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1020     __ jcc(Assembler::equal, skip_fixup);
1021     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1022   }
1023 
1024   address c2i_entry = __ pc();
1025 
1026   // Class initialization barrier for static methods
1027   address c2i_no_clinit_check_entry = nullptr;
1028   if (VM_Version::supports_fast_class_init_checks()) {
1029     Label L_skip_barrier;
1030     Register method = rbx;
1031 
1032     { // Bypass the barrier for non-static methods
1033       Register flags = rscratch1;
1034       __ movl(flags, Address(method, Method::access_flags_offset()));
1035       __ testl(flags, JVM_ACC_STATIC);
1036       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1037     }
1038 
1039     Register klass = rscratch1;
1040     __ load_method_holder(klass, method);
1041     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1042 
1043     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1044 
1045     __ bind(L_skip_barrier);
1046     c2i_no_clinit_check_entry = __ pc();
1047   }
1048 
1049   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1050   bs->c2i_entry_barrier(masm);
1051 
1052   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1053 
1054   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1055 }
1056 
1057 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1058                                          VMRegPair *regs,
1059                                          VMRegPair *regs2,
1060                                          int total_args_passed) {
1061   assert(regs2 == nullptr, "not needed on x86");
// We return the number of VMRegImpl stack slots we need to reserve for all
// the arguments NOT counting out_preserve_stack_slots.
1064 
1065 // NOTE: These arrays will have to change when c1 is ported
1066 #ifdef _WIN64
1067     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1068       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1069     };
1070     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1071       c_farg0, c_farg1, c_farg2, c_farg3
1072     };
1073 #else
1074     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1075       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1076     };
1077     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1078       c_farg0, c_farg1, c_farg2, c_farg3,
1079       c_farg4, c_farg5, c_farg6, c_farg7
1080     };
1081 #endif // _WIN64
1082 
1083 
1084     uint int_args = 0;
1085     uint fp_args = 0;
1086     uint stk_args = 0; // inc by 2 each time
1087 
1088     for (int i = 0; i < total_args_passed; i++) {
1089       switch (sig_bt[i]) {
1090       case T_BOOLEAN:
1091       case T_CHAR:
1092       case T_BYTE:
1093       case T_SHORT:
1094       case T_INT:
1095         if (int_args < Argument::n_int_register_parameters_c) {
1096           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1097 #ifdef _WIN64
1098           fp_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1100           stk_args += 2;
1101 #endif
1102         } else {
1103           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1104           stk_args += 2;
1105         }
1106         break;
1107       case T_LONG:
1108         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1109         // fall through
1110       case T_OBJECT:
1111       case T_ARRAY:
1112       case T_ADDRESS:
1113       case T_METADATA:
1114         if (int_args < Argument::n_int_register_parameters_c) {
1115           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1116 #ifdef _WIN64
1117           fp_args++;
1118           stk_args += 2;
1119 #endif
1120         } else {
1121           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1122           stk_args += 2;
1123         }
1124         break;
1125       case T_FLOAT:
1126         if (fp_args < Argument::n_float_register_parameters_c) {
1127           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1128 #ifdef _WIN64
1129           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1131           stk_args += 2;
1132 #endif
1133         } else {
1134           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1135           stk_args += 2;
1136         }
1137         break;
1138       case T_DOUBLE:
1139         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1140         if (fp_args < Argument::n_float_register_parameters_c) {
1141           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1142 #ifdef _WIN64
1143           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1145           stk_args += 2;
1146 #endif
1147         } else {
1148           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1149           stk_args += 2;
1150         }
1151         break;
1152       case T_VOID: // Halves of longs and doubles
1153         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1154         regs[i].set_bad();
1155         break;
1156       default:
1157         ShouldNotReachHere();
1158         break;
1159       }
1160     }
1161 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
1164   if (stk_args < 8) {
1165     stk_args = 8;
1166   }
1167 #endif // _WIN64
1168 
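  // A worked example (signature chosen purely for illustration): for
  // (int, double, Object) a SysV target gets
  //   int -> c_rarg0, double -> c_farg0, Object -> c_rarg1, and 0 is returned;
  // Win64, where register argument positions are shared across int/fp and
  // shadow space is reserved, gets
  //   int -> c_rarg0, double -> c_farg1, Object -> c_rarg2,
  // with stk_args bumped to the 8-slot (32-byte) minimum by the check above.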
1169   return stk_args;
1170 }
1171 
1172 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1173                                              uint num_bits,
1174                                              uint total_args_passed) {
1175   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1176          "only certain vector sizes are supported for now");
1177 
1178   static const XMMRegister VEC_ArgReg[32] = {
1179      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1180      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1181     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1182     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1183   };
1184 
1185   uint stk_args = 0;
1186   uint fp_args = 0;
1187 
1188   for (uint i = 0; i < total_args_passed; i++) {
1189     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1190     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1191     regs[i].set_pair(vmreg->next(next_val), vmreg);
1192   }
1193 
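  // For example, with num_bits == 256 argument i is passed in xmm(i) and
  // regs[i] spans vmreg .. vmreg->next(7), i.e. eight 32-bit slots; no stack
  // slots are consumed, so 0 is returned.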
1194   return stk_args;
1195 }
1196 
1197 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1200   switch (ret_type) {
1201   case T_FLOAT:
1202     __ movflt(Address(rbp, -wordSize), xmm0);
1203     break;
1204   case T_DOUBLE:
1205     __ movdbl(Address(rbp, -wordSize), xmm0);
1206     break;
1207   case T_VOID:  break;
1208   default: {
1209     __ movptr(Address(rbp, -wordSize), rax);
1210     }
1211   }
1212 }
1213 
1214 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1217   switch (ret_type) {
1218   case T_FLOAT:
1219     __ movflt(xmm0, Address(rbp, -wordSize));
1220     break;
1221   case T_DOUBLE:
1222     __ movdbl(xmm0, Address(rbp, -wordSize));
1223     break;
1224   case T_VOID:  break;
1225   default: {
1226     __ movptr(rax, Address(rbp, -wordSize));
1227     }
1228   }
1229 }
1230 
1231 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1232     for ( int i = first_arg ; i < arg_count ; i++ ) {
1233       if (args[i].first()->is_Register()) {
1234         __ push(args[i].first()->as_Register());
1235       } else if (args[i].first()->is_XMMRegister()) {
1236         __ subptr(rsp, 2*wordSize);
1237         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1238       }
1239     }
1240 }
1241 
1242 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1243     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1244       if (args[i].first()->is_Register()) {
1245         __ pop(args[i].first()->as_Register());
1246       } else if (args[i].first()->is_XMMRegister()) {
1247         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1248         __ addptr(rsp, 2*wordSize);
1249       }
1250     }
1251 }
1252 
1253 static void verify_oop_args(MacroAssembler* masm,
1254                             const methodHandle& method,
1255                             const BasicType* sig_bt,
1256                             const VMRegPair* regs) {
1257   Register temp_reg = rbx;  // not part of any compiled calling seq
1258   if (VerifyOops) {
1259     for (int i = 0; i < method->size_of_parameters(); i++) {
1260       if (is_reference_type(sig_bt[i])) {
1261         VMReg r = regs[i].first();
1262         assert(r->is_valid(), "bad oop arg");
1263         if (r->is_stack()) {
1264           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1265           __ verify_oop(temp_reg);
1266         } else {
1267           __ verify_oop(r->as_Register());
1268         }
1269       }
1270     }
1271   }
1272 }
1273 
1274 static void check_continuation_enter_argument(VMReg actual_vmreg,
1275                                               Register expected_reg,
1276                                               const char* name) {
1277   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1278   assert(actual_vmreg->as_Register() == expected_reg,
1279          "%s is in unexpected register: %s instead of %s",
1280          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1281 }
1282 
1283 
1284 //---------------------------- continuation_enter_setup ---------------------------
1285 //
1286 // Arguments:
1287 //   None.
1288 //
1289 // Results:
1290 //   rsp: pointer to blank ContinuationEntry
1291 //
1292 // Kills:
1293 //   rax
1294 //
1295 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1296   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1297   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1298   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1299 
1300   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1301   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1302 
1303   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1304   OopMap* map = new OopMap(frame_size, 0);
1305 
1306   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1307   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1308   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1309 
1310   return map;
1311 }
1312 
1313 //---------------------------- fill_continuation_entry ---------------------------
1314 //
1315 // Arguments:
1316 //   rsp: pointer to blank Continuation entry
1317 //   reg_cont_obj: pointer to the continuation
1318 //   reg_flags: flags
1319 //
1320 // Results:
1321 //   rsp: pointer to filled out ContinuationEntry
1322 //
1323 // Kills:
1324 //   rax
1325 //
1326 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1327   assert_different_registers(rax, reg_cont_obj, reg_flags);
1328 #ifdef ASSERT
1329   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1330 #endif
1331   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1332   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1333   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1334   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1335   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1336 
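       // Save the parent's cont_fastpath and held monitor count into the entry, then clear
       // them in the thread for the new continuation.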
1337   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1338   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1339   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1340   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1341 
1342   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1343   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1344 }
1345 
1346 //---------------------------- continuation_enter_cleanup ---------------------------
1347 //
1348 // Arguments:
1349 //   rsp: pointer to the ContinuationEntry
1350 //
1351 // Results:
1352 //   rsp: pointer to the spilled rbp in the entry frame
1353 //
1354 // Kills:
1355 //   rbx
1356 //
1357 static void continuation_enter_cleanup(MacroAssembler* masm) {
1358 #ifdef ASSERT
1359   Label L_good_sp;
1360   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1361   __ jcc(Assembler::equal, L_good_sp);
1362   __ stop("Incorrect rsp at continuation_enter_cleanup");
1363   __ bind(L_good_sp);
1364 #endif
1365 
1366   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1367   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1368   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1369   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1370 
1371   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1372   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1373   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1374 }
1375 
1376 static void gen_continuation_enter(MacroAssembler* masm,
1377                                    const VMRegPair* regs,
1378                                    int& exception_offset,
1379                                    OopMapSet* oop_maps,
1380                                    int& frame_complete,
1381                                    int& stack_slots,
1382                                    int& interpreted_entry_offset,
1383                                    int& compiled_entry_offset) {
1384 
1385   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1386   int pos_cont_obj   = 0;
1387   int pos_is_cont    = 1;
1388   int pos_is_virtual = 2;
1389 
1390   // The platform-specific calling convention may present the arguments in various registers.
1391 // To simplify the rest of the code, we expect the arguments to reside in these known
1392 // registers, and we additionally check the placement here in case the calling convention
1393 // ever changes.
1394   Register reg_cont_obj   = c_rarg1;
1395   Register reg_is_cont    = c_rarg2;
1396   Register reg_is_virtual = c_rarg3;
1397 
1398   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1399   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1400   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1401 
1402   // Utility methods kill rax, make sure there are no collisions
1403   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1404 
1405   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1406                          relocInfo::static_call_type);
1407 
1408   address start = __ pc();
1409 
1410   Label L_thaw, L_exit;
1411 
1412   // i2i entry used only in interp_only_mode
1413   interpreted_entry_offset = __ pc() - start;
1414   {
1415 #ifdef ASSERT
1416     Label is_interp_only;
1417     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1418     __ jcc(Assembler::notEqual, is_interp_only);
1419     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1420     __ bind(is_interp_only);
1421 #endif
1422 
1423     __ pop(rax); // return address
1424     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1425     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1426     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1427     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1428     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1429     __ push(rax); // return address
1430     __ push_cont_fastpath();
1431 
1432     __ enter();
1433 
1434     stack_slots = 2; // will be adjusted in setup
1435     OopMap* map = continuation_enter_setup(masm, stack_slots);
1436     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear
1437     // unsafe. That is acceptable: at worst we miss an async sample, and we are in interp_only_mode anyway.
1438 
1439     __ verify_oop(reg_cont_obj);
1440 
1441     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1442 
1443     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1444     __ testptr(reg_is_cont, reg_is_cont);
1445     __ jcc(Assembler::notZero, L_thaw);
1446 
1447     // --- Resolve path
1448 
1449     // Make sure the call is patchable
1450     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1451     // Emit stub for static call
1452     CodeBuffer* cbuf = masm->code_section()->outer();
1453     address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1454     if (stub == nullptr) {
1455       fatal("CodeCache is full at gen_continuation_enter");
1456     }
1457     __ call(resolve);
1458     oop_maps->add_gc_map(__ pc() - start, map);
1459     __ post_call_nop();
1460 
1461     __ jmp(L_exit);
1462   }
1463 
1464   // compiled entry
1465   __ align(CodeEntryAlignment);
1466   compiled_entry_offset = __ pc() - start;
1467   __ enter();
1468 
1469   stack_slots = 2; // will be adjusted in setup
1470   OopMap* map = continuation_enter_setup(masm, stack_slots);
1471 
1472   // Frame is now completed as far as size and linkage.
1473   frame_complete = __ pc() - start;
1474 
1475   __ verify_oop(reg_cont_obj);
1476 
1477   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1478 
1479   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1480   __ testptr(reg_is_cont, reg_is_cont);
1481   __ jccb(Assembler::notZero, L_thaw);
1482 
1483   // --- call Continuation.enter(Continuation c, boolean isContinue)
1484 
1485   // Make sure the call is patchable
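       // Aligning here keeps the call's 32-bit displacement from straddling a word boundary,
       // so it can be patched atomically once the call target has been resolved.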
1486   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1487 
1488   // Emit stub for static call
1489   CodeBuffer* cbuf = masm->code_section()->outer();
1490   address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1491   if (stub == nullptr) {
1492     fatal("CodeCache is full at gen_continuation_enter");
1493   }
1494 
1495   // The call needs to be resolved. There's a special case for this in
1496   // SharedRuntime::find_callee_info_helper() which calls
1497   // LinkResolver::resolve_continuation_enter() which resolves the call to
1498   // Continuation.enter(Continuation c, boolean isContinue).
1499   __ call(resolve);
1500 
1501   oop_maps->add_gc_map(__ pc() - start, map);
1502   __ post_call_nop();
1503 
1504   __ jmpb(L_exit);
1505 
1506   // --- Thawing path
1507 
1508   __ bind(L_thaw);
1509 
1510   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1511 
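       // Record the code offset of the pc immediately after the thaw call; the runtime uses it
       // to recognize returns into the continuation-enter frame.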
1512   ContinuationEntry::_return_pc_offset = __ pc() - start;
1513   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1514   __ post_call_nop();
1515 
1516   // --- Normal exit (resolve/thawing)
1517 
1518   __ bind(L_exit);
1519 
1520   continuation_enter_cleanup(masm);
1521   __ pop(rbp);
1522   __ ret(0);
1523 
1524   // --- Exception handling path
1525 
1526   exception_offset = __ pc() - start;
1527 
1528   continuation_enter_cleanup(masm);
1529   __ pop(rbp);
1530 
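       // Look up the exception handler for the return address that is now on top of the stack.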
1531   __ movptr(c_rarg0, r15_thread);
1532   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1533 
1534   // rax still holds the original exception oop, save it before the call
1535   __ push(rax);
1536 
1537   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1538   __ movptr(rbx, rax);
1539 
1540   // Continue at exception handler:
1541   //   rax: exception oop
1542   //   rbx: exception handler
1543   //   rdx: exception pc
1544   __ pop(rax);
1545   __ verify_oop(rax);
1546   __ pop(rdx);
1547   __ jmp(rbx);
1548 }
1549 
1550 static void gen_continuation_yield(MacroAssembler* masm,
1551                                    const VMRegPair* regs,
1552                                    OopMapSet* oop_maps,
1553                                    int& frame_complete,
1554                                    int& stack_slots,
1555                                    int& compiled_entry_offset) {
1556   enum layout {
1557     rbp_off,
1558     rbpH_off,
1559     return_off,
1560     return_off2,
1561     framesize // inclusive of return address
1562   };
1563   stack_slots = framesize /  VMRegImpl::slots_per_word;
1564   assert(stack_slots == 2, "recheck layout");
1565 
1566   address start = __ pc();
1567   compiled_entry_offset = __ pc() - start;
1568   __ enter();
1569   address the_pc = __ pc();
1570 
1571   frame_complete = the_pc - start;
1572 
1573   // This nop must be exactly at the PC we push into the frame info.
1574   // We use this nop for fast CodeBlob lookup, associate the OopMap
1575   // with it right away.
1576   __ post_call_nop();
1577   OopMap* map = new OopMap(framesize, 1);
1578   oop_maps->add_gc_map(frame_complete, map);
1579 
1580   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1581   __ movptr(c_rarg0, r15_thread);
1582   __ movptr(c_rarg1, rsp);
1583   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1584   __ reset_last_Java_frame(true);
1585 
1586   Label L_pinned;
1587 
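       // Freeze returns 0 on success; a non-zero result means the continuation could not be
       // frozen (e.g. it is pinned) and we return that result to the caller.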
1588   __ testptr(rax, rax);
1589   __ jcc(Assembler::notZero, L_pinned);
1590 
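       // Frames were frozen successfully: reset rsp to the ContinuationEntry, tear the entry
       // down, and return from the enter frame.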
1591   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1592   continuation_enter_cleanup(masm);
1593   __ pop(rbp);
1594   __ ret(0);
1595 
1596   __ bind(L_pinned);
1597 
1598   // Pinned, return to caller
1599 
1600   // handle pending exception thrown by freeze
1601   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1602   Label ok;
1603   __ jcc(Assembler::equal, ok);
1604   __ leave();
1605   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1606   __ bind(ok);
1607 
1608   __ leave();
1609   __ ret(0);
1610 }
1611 
1612 static void gen_special_dispatch(MacroAssembler* masm,
1613                                  const methodHandle& method,
1614                                  const BasicType* sig_bt,
1615                                  const VMRegPair* regs) {
1616   verify_oop_args(masm, method, sig_bt, regs);
1617   vmIntrinsics::ID iid = method->intrinsic_id();
1618 
1619   // Now write the args into the outgoing interpreter space
1620   bool     has_receiver   = false;
1621   Register receiver_reg   = noreg;
1622   int      member_arg_pos = -1;
1623   Register member_reg     = noreg;
1624   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1625   if (ref_kind != 0) {
1626     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1627     member_reg = rbx;  // known to be free at this point
1628     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1629   } else if (iid == vmIntrinsics::_invokeBasic) {
1630     has_receiver = true;
1631   } else if (iid == vmIntrinsics::_linkToNative) {
1632     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1633     member_reg = rbx;  // known to be free at this point
1634   } else {
1635     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1636   }
1637 
1638   if (member_reg != noreg) {
1639     // Load the member_arg into register, if necessary.
1640     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1641     VMReg r = regs[member_arg_pos].first();
1642     if (r->is_stack()) {
1643       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1644     } else {
1645       // no data motion is needed
1646       member_reg = r->as_Register();
1647     }
1648   }
1649 
1650   if (has_receiver) {
1651     // Make sure the receiver is loaded into a register.
1652     assert(method->size_of_parameters() > 0, "oob");
1653     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1654     VMReg r = regs[0].first();
1655     assert(r->is_valid(), "bad receiver arg");
1656     if (r->is_stack()) {
1657       // Porting note:  This assumes that compiled calling conventions always
1658       // pass the receiver oop in a register.  If this is not true on some
1659       // platform, pick a temp and load the receiver from stack.
1660       fatal("receiver always in a register");
1661       receiver_reg = j_rarg0;  // known to be free at this point
1662       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1663     } else {
1664       // no data motion is needed
1665       receiver_reg = r->as_Register();
1666     }
1667   }
1668 
1669   // Figure out which address we are really jumping to:
1670   MethodHandles::generate_method_handle_dispatch(masm, iid,
1671                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1672 }
1673 
1674 // ---------------------------------------------------------------------------
1675 // Generate a native wrapper for a given method.  The method takes arguments
1676 // in the Java compiled code convention, marshals them to the native
1677 // convention (handlizes oops, etc), transitions to native, makes the call,
1678 // returns to java state (possibly blocking), unhandlizes any result and
1679 // returns.
1680 //
1681 // Critical native functions are a shorthand for the use of
1682 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1683 // functions.  The wrapper is expected to unpack the arguments before
1684 // passing them to the callee. Critical native functions leave the state _in_Java,
1685 // since they cannot stop for GC.
1686 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1687 // block and the check for pending exceptions, since it is impossible for them
1688 // to be thrown.
1689 //
1690 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1691                                                 const methodHandle& method,
1692                                                 int compile_id,
1693                                                 BasicType* in_sig_bt,
1694                                                 VMRegPair* in_regs,
1695                                                 BasicType ret_type) {
1696   if (method->is_continuation_native_intrinsic()) {
1697     int exception_offset = -1;
1698     OopMapSet* oop_maps = new OopMapSet();
1699     int frame_complete = -1;
1700     int stack_slots = -1;
1701     int interpreted_entry_offset = -1;
1702     int vep_offset = -1;
1703     if (method->is_continuation_enter_intrinsic()) {
1704       gen_continuation_enter(masm,
1705                              in_regs,
1706                              exception_offset,
1707                              oop_maps,
1708                              frame_complete,
1709                              stack_slots,
1710                              interpreted_entry_offset,
1711                              vep_offset);
1712     } else if (method->is_continuation_yield_intrinsic()) {
1713       gen_continuation_yield(masm,
1714                              in_regs,
1715                              oop_maps,
1716                              frame_complete,
1717                              stack_slots,
1718                              vep_offset);
1719     } else {
1720       guarantee(false, "Unknown Continuation native intrinsic");
1721     }
1722 
1723 #ifdef ASSERT
1724     if (method->is_continuation_enter_intrinsic()) {
1725       assert(interpreted_entry_offset != -1, "Must be set");
1726       assert(exception_offset != -1,         "Must be set");
1727     } else {
1728       assert(interpreted_entry_offset == -1, "Must be unset");
1729       assert(exception_offset == -1,         "Must be unset");
1730     }
1731     assert(frame_complete != -1,    "Must be set");
1732     assert(stack_slots != -1,       "Must be set");
1733     assert(vep_offset != -1,        "Must be set");
1734 #endif
1735 
1736     __ flush();
1737     nmethod* nm = nmethod::new_native_nmethod(method,
1738                                               compile_id,
1739                                               masm->code(),
1740                                               vep_offset,
1741                                               frame_complete,
1742                                               stack_slots,
1743                                               in_ByteSize(-1),
1744                                               in_ByteSize(-1),
1745                                               oop_maps,
1746                                               exception_offset);
1747     if (method->is_continuation_enter_intrinsic()) {
1748       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1749     } else if (method->is_continuation_yield_intrinsic()) {
1750       _cont_doYield_stub = nm;
1751     }
1752     return nm;
1753   }
1754 
1755   if (method->is_method_handle_intrinsic()) {
1756     vmIntrinsics::ID iid = method->intrinsic_id();
1757     intptr_t start = (intptr_t)__ pc();
1758     int vep_offset = ((intptr_t)__ pc()) - start;
1759     gen_special_dispatch(masm,
1760                          method,
1761                          in_sig_bt,
1762                          in_regs);
1763     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1764     __ flush();
1765     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1766     return nmethod::new_native_nmethod(method,
1767                                        compile_id,
1768                                        masm->code(),
1769                                        vep_offset,
1770                                        frame_complete,
1771                                        stack_slots / VMRegImpl::slots_per_word,
1772                                        in_ByteSize(-1),
1773                                        in_ByteSize(-1),
1774                                        nullptr);
1775   }
1776   address native_func = method->native_function();
1777   assert(native_func != nullptr, "must have function");
1778 
1779   // An OopMap for lock (and class if static)
1780   OopMapSet *oop_maps = new OopMapSet();
1781   intptr_t start = (intptr_t)__ pc();
1782 
1783   // We have received a description of where all the java args are located
1784   // on entry to the wrapper. We need to convert these args to where
1785   // the jni function will expect them. To figure out where they go
1786   // we convert the java signature to a C signature by inserting
1787   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1788 
1789   const int total_in_args = method->size_of_parameters();
1790   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
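       // The extra C args are the JNIEnv* and, for static methods, the class mirror.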
1791 
1792   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1793   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1794   BasicType* in_elem_bt = nullptr;
1795 
1796   int argc = 0;
1797   out_sig_bt[argc++] = T_ADDRESS;
1798   if (method->is_static()) {
1799     out_sig_bt[argc++] = T_OBJECT;
1800   }
1801 
1802   for (int i = 0; i < total_in_args ; i++ ) {
1803     out_sig_bt[argc++] = in_sig_bt[i];
1804   }
1805 
1806   // Now figure out where the args must be stored and how much stack space
1807   // they require.
1808   int out_arg_slots;
1809   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, nullptr, total_c_args);
1810 
1811   // Compute framesize for the wrapper.  We need to handlize all oops in
1812   // incoming registers
1813 
1814   // Calculate the total number of stack slots we will need.
1815 
1816   // First count the abi requirement plus all of the outgoing args
1817   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1818 
1819   // Now the space for the inbound oop handle area
1820   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1821 
1822   int oop_handle_offset = stack_slots;
1823   stack_slots += total_save_slots;
1824 
1825   // Now any space we need for handlizing a klass if static method
1826 
1827   int klass_slot_offset = 0;
1828   int klass_offset = -1;
1829   int lock_slot_offset = 0;
1830   bool is_static = false;
1831 
1832   if (method->is_static()) {
1833     klass_slot_offset = stack_slots;
1834     stack_slots += VMRegImpl::slots_per_word;
1835     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1836     is_static = true;
1837   }
1838 
1839   // Plus a lock if needed
1840 
1841   if (method->is_synchronized()) {
1842     lock_slot_offset = stack_slots;
1843     stack_slots += VMRegImpl::slots_per_word;
1844   }
1845 
1846   // Now a place (+2) to save return values or temps during shuffling,
1847   // plus 4 for the return address (which we own) and saved rbp
1848   stack_slots += 6;
1849 
1850   // OK, the space we have allocated will look like:
1851   //
1852   //
1853   // FP-> |                     |
1854   //      |---------------------|
1855   //      | 2 slots for moves   |
1856   //      |---------------------|
1857   //      | lock box (if sync)  |
1858   //      |---------------------| <- lock_slot_offset
1859   //      | klass (if static)   |
1860   //      |---------------------| <- klass_slot_offset
1861   //      | oopHandle area      |
1862   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1863   //      | outbound memory     |
1864   //      | based arguments     |
1865   //      |                     |
1866   //      |---------------------|
1867   //      |                     |
1868   // SP-> | out_preserved_slots |
1869   //
1870   //
1871 
1872 
1873   // Now compute actual number of stack words we need rounding to make
1874   // stack properly aligned.
1875   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1876 
1877   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1878 
1879   // First thing make an ic check to see if we should even be here
1880 
1881   // We are free to use all registers as temps without saving them and
1882   // restoring them except rbp. rbp is the only callee save register
1883   // as far as the interpreter and the compiler(s) are concerned.
1884 
1885 
1886   const Register ic_reg = rax;
1887   const Register receiver = j_rarg0;
1888 
1889   Label hit;
1890   Label exception_pending;
1891 
1892   assert_different_registers(ic_reg, receiver, rscratch1, rscratch2);
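       // Inline cache check: the caller passes the expected klass in rax (ic_reg). Compare it
       // against the receiver's actual klass and jump to the ic-miss stub on a mismatch.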
1893   __ verify_oop(receiver);
1894   __ load_klass(rscratch1, receiver, rscratch2);
1895   __ cmpq(ic_reg, rscratch1);
1896   __ jcc(Assembler::equal, hit);
1897 
1898   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1899 
1900   // Verified entry point must be aligned
1901   __ align(8);
1902 
1903   __ bind(hit);
1904 
1905   int vep_offset = ((intptr_t)__ pc()) - start;
1906 
1907   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1908     Label L_skip_barrier;
1909     Register klass = r10;
1910     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1911     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1912 
1913     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1914 
1915     __ bind(L_skip_barrier);
1916   }
1917 
1918 #ifdef COMPILER1
1919   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1920   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1921     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1922   }
1923 #endif // COMPILER1
1924 
1925   // The instruction at the verified entry point must be 5 bytes or longer
1926   // because it can be patched on the fly by make_non_entrant. The stack bang
1927   // instruction fits that requirement.
1928 
1929   // Generate stack overflow check
1930   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1931 
1932   // Generate a new frame for the wrapper.
1933   __ enter();
1934   // -2 because return address is already present and so is saved rbp
1935   __ subptr(rsp, stack_size - 2*wordSize);
1936 
1937   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1938   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1939   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1940 
1941   // Frame is now completed as far as size and linkage.
1942   int frame_complete = ((intptr_t)__ pc()) - start;
1943 
1944   if (UseRTMLocking) {
1945     // Abort RTM transaction before calling JNI
1946     // because critical section will be large and will be
1947     // aborted anyway. Also nmethod could be deoptimized.
1948     __ xabort(0);
1949   }
1950 
1951 #ifdef ASSERT
1952   __ check_stack_alignment(rsp, "improperly aligned stack");
1953 #endif /* ASSERT */
1954 
1955 
1956   // We use r14 as the oop handle for the receiver/klass
1957   // It is callee save so it survives the call to native
1958 
1959   const Register oop_handle_reg = r14;
1960 
1961   //
1962   // We immediately shuffle the arguments so that any vm call we have to
1963   // make from here on out (sync slow path, jvmti, etc.) we will have
1964   // captured the oops from our caller and have a valid oopMap for
1965   // them.
1966 
1967   // -----------------
1968   // The Grand Shuffle
1969 
1970   // The Java calling convention is either equal (linux) or denser (win64) than the
1971   // c calling convention. However, because of the jni_env argument, the c calling
1972   // convention always has at least one more (and two for static) arguments than Java.
1973   // Therefore if we move the args from java -> c backwards then we will never have
1974   // a register->register conflict and we don't have to build a dependency graph
1975   // and figure out how to break any cycles.
1976   //
1977 
1978   // Record esp-based slot for receiver on stack for non-static methods
1979   int receiver_offset = -1;
1980 
1981   // This is a trick. We double the stack slots so we can claim
1982   // the oops in the caller's frame. Since we are sure to have
1983   // more args than the caller doubling is enough to make
1984   // sure we can capture all the incoming oop args from the
1985   // caller.
1986   //
1987   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1988 
1989   // Mark location of rbp (someday)
1990   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1991 
1992   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1993   // All inbound args are referenced based on rbp and all outbound args via rsp.
1994 
1995 
1996 #ifdef ASSERT
1997   bool reg_destroyed[Register::number_of_registers];
1998   bool freg_destroyed[XMMRegister::number_of_registers];
1999   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2000     reg_destroyed[r] = false;
2001   }
2002   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2003     freg_destroyed[f] = false;
2004   }
2005 
2006 #endif /* ASSERT */
2007 
2008   // For JNI natives the incoming and outgoing registers are offset upwards.
2009   GrowableArray<int> arg_order(2 * total_in_args);
2010 
2011   VMRegPair tmp_vmreg;
2012   tmp_vmreg.set2(rbx->as_VMReg());
2013 
2014   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2015     arg_order.push(i);
2016     arg_order.push(c_arg);
2017   }
2018 
2019   int temploc = -1;
2020   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2021     int i = arg_order.at(ai);
2022     int c_arg = arg_order.at(ai + 1);
2023     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2024 #ifdef ASSERT
2025     if (in_regs[i].first()->is_Register()) {
2026       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2027     } else if (in_regs[i].first()->is_XMMRegister()) {
2028       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2029     }
2030     if (out_regs[c_arg].first()->is_Register()) {
2031       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2032     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2033       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2034     }
2035 #endif /* ASSERT */
2036     switch (in_sig_bt[i]) {
2037       case T_ARRAY:
2038       case T_OBJECT:
2039         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2040                     ((i == 0) && (!is_static)),
2041                     &receiver_offset);
2042         break;
2043       case T_VOID:
2044         break;
2045 
2046       case T_FLOAT:
2047         __ float_move(in_regs[i], out_regs[c_arg]);
2048         break;
2049 
2050       case T_DOUBLE:
2051         assert( i + 1 < total_in_args &&
2052                 in_sig_bt[i + 1] == T_VOID &&
2053                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2054         __ double_move(in_regs[i], out_regs[c_arg]);
2055         break;
2056 
2057       case T_LONG :
2058         __ long_move(in_regs[i], out_regs[c_arg]);
2059         break;
2060 
2061       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2062 
2063       default:
2064         __ move32_64(in_regs[i], out_regs[c_arg]);
2065     }
2066   }
2067 
2068   int c_arg;
2069 
2070   // Pre-load a static method's oop into r14.  Used both by locking code and
2071   // the normal JNI call code.
2072   // Point c_arg at the first arg that is already loaded in case we
2073   // need to spill before we call out.
2074   c_arg = total_c_args - total_in_args;
2075 
2076   if (method->is_static()) {
2077 
2078     //  load oop into a register
2079     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2080 
2081     // Now handlize the static class mirror; it's known to be non-null.
2082     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2083     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2084 
2085     // Now get the handle
2086     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2087     // store the klass handle as second argument
2088     __ movptr(c_rarg1, oop_handle_reg);
2089     // and protect the arg if we must spill
2090     c_arg--;
2091   }
2092 
2093   // Change state to native (we save the return address in the thread, since it might not
2094   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2095   // points into the right code segment. It does not have to be the correct return pc.
2096   // We use the same pc/oopMap repeatedly when we call out
2097 
2098   intptr_t the_pc = (intptr_t) __ pc();
2099   oop_maps->add_gc_map(the_pc - start, map);
2100 
2101   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2102 
2103 
2104   // We have all of the arguments set up at this point. From here on we must not touch any of
2105   // the argument registers (if we had to save/restore them, there would be no oops to protect).
2106 
2107   {
2108     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2109     // protect the args we've loaded
2110     save_args(masm, total_c_args, c_arg, out_regs);
2111     __ mov_metadata(c_rarg1, method());
2112     __ call_VM_leaf(
2113       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2114       r15_thread, c_rarg1);
2115     restore_args(masm, total_c_args, c_arg, out_regs);
2116   }
2117 
2118   // RedefineClasses() tracing support for obsolete method entry
2119   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2120     // protect the args we've loaded
2121     save_args(masm, total_c_args, c_arg, out_regs);
2122     __ mov_metadata(c_rarg1, method());
2123     __ call_VM_leaf(
2124       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2125       r15_thread, c_rarg1);
2126     restore_args(masm, total_c_args, c_arg, out_regs);
2127   }
2128 
2129   // Lock a synchronized method
2130 
2131   // Register definitions used by locking and unlocking
2132 
2133   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2134   const Register obj_reg  = rbx;  // Will contain the oop
2135   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2136   const Register old_hdr  = r13;  // value of old header at unlock time
2137 
2138   Label slow_path_lock;
2139   Label lock_done;
2140 
2141   if (method->is_synchronized()) {
2142     Label count_mon;
2143 
2144     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2145 
2146     // Get the handle (the 2nd argument)
2147     __ mov(oop_handle_reg, c_rarg1);
2148 
2149     // Get address of the box
2150 
2151     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2152 
2153     // Load the oop from the handle
2154     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2155 
2156     if (LockingMode == LM_MONITOR) {
2157       __ jmp(slow_path_lock);
2158     } else if (LockingMode == LM_LEGACY) {
2159       // Load immediate 1 into swap_reg %rax
2160       __ movl(swap_reg, 1);
2161 
2162       // Load (object->mark() | 1) into swap_reg %rax
2163       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2164 
2165       // Save (object->mark() | 1) into BasicLock's displaced header
2166       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2167 
2168       // src -> dest iff dest == rax else rax <- dest
2169       __ lock();
2170       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2171       __ jcc(Assembler::equal, count_mon);
2172 
2173       // Hmm should this move to the slow path code area???
2174 
2175       // Test if the oopMark is an obvious stack pointer, i.e.,
2176       //  1) (mark & 3) == 0, and
2177       //  2) rsp <= mark < mark + os::pagesize()
2178       // These 3 tests can be done by evaluating the following
2179       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2180       // assuming both stack pointer and pagesize have their
2181       // least significant 2 bits clear.
2182       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2183 
2184       __ subptr(swap_reg, rsp);
2185       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2186 
2187       // Save the test result, for recursive case, the result is zero
2188       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2189       __ jcc(Assembler::notEqual, slow_path_lock);
2190     } else {
2191       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2192       // Load object header
2193       __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2194       __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2195     }
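         // Lock acquired on the fast path: account for it in the thread's held monitor count.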
2196     __ bind(count_mon);
2197     __ inc_held_monitor_count();
2198 
2199     // Slow path will re-enter here
2200     __ bind(lock_done);
2201   }
2202 
2203   // Finally just about ready to make the JNI call
2204 
2205   // get JNIEnv* which is first argument to native
2206   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2207 
2208   // Now set thread in native
2209   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2210 
2211   __ call(RuntimeAddress(native_func));
2212 
2213   // Verify or restore cpu control state after JNI call
2214   __ restore_cpu_control_state_after_jni(rscratch1);
2215 
2216   // Unpack native results.
2217   switch (ret_type) {
2218   case T_BOOLEAN: __ c2bool(rax);            break;
2219   case T_CHAR   : __ movzwl(rax, rax);      break;
2220   case T_BYTE   : __ sign_extend_byte (rax); break;
2221   case T_SHORT  : __ sign_extend_short(rax); break;
2222   case T_INT    : /* nothing to do */        break;
2223   case T_DOUBLE :
2224   case T_FLOAT  :
2225     // Result is in xmm0 we'll save as needed
2226     break;
2227   case T_ARRAY:                 // Really a handle
2228   case T_OBJECT:                // Really a handle
2229       break; // can't de-handlize until after safepoint check
2230   case T_VOID: break;
2231   case T_LONG: break;
2232   default       : ShouldNotReachHere();
2233   }
2234 
2235   Label after_transition;
2236 
2237   // Switch thread to "native transition" state before reading the synchronization state.
2238   // This additional state is necessary because reading and testing the synchronization
2239   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2240   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2241   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2242   //     Thread A is resumed to finish this native method, but doesn't block here since it
2243   //     didn't see any synchronization in progress, and escapes.
2244   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2245 
2246   // Force this write out before the read below
2247   if (!UseSystemMemoryBarrier) {
2248     __ membar(Assembler::Membar_mask_bits(
2249               Assembler::LoadLoad | Assembler::LoadStore |
2250               Assembler::StoreLoad | Assembler::StoreStore));
2251   }
2252 
2253   // check for safepoint operation in progress and/or pending suspend requests
2254   {
2255     Label Continue;
2256     Label slow_path;
2257 
2258     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2259 
2260     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2261     __ jcc(Assembler::equal, Continue);
2262     __ bind(slow_path);
2263 
2264     // Don't use call_VM as it will see a possible pending exception and forward it
2265     // and never return here preventing us from clearing _last_native_pc down below.
2266     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2267     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2268     // by hand.
2269     //
2270     __ vzeroupper();
2271     save_native_result(masm, ret_type, stack_slots);
2272     __ mov(c_rarg0, r15_thread);
2273     __ mov(r12, rsp); // remember sp
2274     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2275     __ andptr(rsp, -16); // align stack as required by ABI
2276     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2277     __ mov(rsp, r12); // restore sp
2278     __ reinit_heapbase();
2279     // Restore any method result value
2280     restore_native_result(masm, ret_type, stack_slots);
2281     __ bind(Continue);
2282   }
2283 
2284   // change thread state
2285   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2286   __ bind(after_transition);
2287 
2288   Label reguard;
2289   Label reguard_done;
2290   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2291   __ jcc(Assembler::equal, reguard);
2292   __ bind(reguard_done);
2293 
2294   // native result if any is live
2295 
2296   // Unlock
2297   Label slow_path_unlock;
2298   Label unlock_done;
2299   if (method->is_synchronized()) {
2300 
2301     Label fast_done;
2302 
2303     // Get locked oop from the handle we passed to jni
2304     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2305 
2306     if (LockingMode == LM_LEGACY) {
2307       Label not_recur;
2308       // Simple recursive lock?
2309       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2310       __ jcc(Assembler::notEqual, not_recur);
2311       __ dec_held_monitor_count();
2312       __ jmpb(fast_done);
2313       __ bind(not_recur);
2314     }
2315 
2316     // Must save rax if it is live now because cmpxchg must use it
2317     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2318       save_native_result(masm, ret_type, stack_slots);
2319     }
2320 
2321     if (LockingMode == LM_MONITOR) {
2322       __ jmp(slow_path_unlock);
2323     } else if (LockingMode == LM_LEGACY) {
2324       // get address of the stack lock
2325       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2326       //  get old displaced header
2327       __ movptr(old_hdr, Address(rax, 0));
2328 
2329       // Atomic swap old header if oop still contains the stack lock
2330       __ lock();
2331       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2332       __ jcc(Assembler::notEqual, slow_path_unlock);
2333       __ dec_held_monitor_count();
2334     } else {
2335       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2336       __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2337       __ andptr(swap_reg, ~(int32_t)markWord::lock_mask_in_place);
2338       __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2339       __ dec_held_monitor_count();
2340     }
2341 
2342     // slow path re-enters here
2343     __ bind(unlock_done);
2344     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2345       restore_native_result(masm, ret_type, stack_slots);
2346     }
2347 
2348     __ bind(fast_done);
2349   }
2350   {
2351     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2352     save_native_result(masm, ret_type, stack_slots);
2353     __ mov_metadata(c_rarg1, method());
2354     __ call_VM_leaf(
2355          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2356          r15_thread, c_rarg1);
2357     restore_native_result(masm, ret_type, stack_slots);
2358   }
2359 
2360   __ reset_last_Java_frame(false);
2361 
2362   // Unbox oop result, e.g. JNIHandles::resolve value.
2363   if (is_reference_type(ret_type)) {
2364     __ resolve_jobject(rax /* value */,
2365                        r15_thread /* thread */,
2366                        rcx /* tmp */);
2367   }
2368 
2369   if (CheckJNICalls) {
2370     // clear_pending_jni_exception_check
2371     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2372   }
2373 
2374   // reset handle block
2375   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2376   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2377 
2378   // pop our frame
2379 
2380   __ leave();
2381 
2382   // Any exception pending?
2383   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2384   __ jcc(Assembler::notEqual, exception_pending);
2385 
2386   // Return
2387 
2388   __ ret(0);
2389 
2390   // Unexpected paths are out of line and go here
2391 
2392   // forward the exception
2393   __ bind(exception_pending);
2394 
2395   // and forward the exception
2396   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2397 
2398   // Slow path locking & unlocking
2399   if (method->is_synchronized()) {
2400 
2401     // BEGIN Slow path lock
2402     __ bind(slow_path_lock);
2403 
2404     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2405     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2406 
2407     // protect the args we've loaded
2408     save_args(masm, total_c_args, c_arg, out_regs);
2409 
2410     __ mov(c_rarg0, obj_reg);
2411     __ mov(c_rarg1, lock_reg);
2412     __ mov(c_rarg2, r15_thread);
2413 
2414     // Not a leaf but we have last_Java_frame setup as we want
2415     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2416     restore_args(masm, total_c_args, c_arg, out_regs);
2417 
2418 #ifdef ASSERT
2419     { Label L;
2420     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2421     __ jcc(Assembler::equal, L);
2422     __ stop("no pending exception allowed on exit from monitorenter");
2423     __ bind(L);
2424     }
2425 #endif
2426     __ jmp(lock_done);
2427 
2428     // END Slow path lock
2429 
2430     // BEGIN Slow path unlock
2431     __ bind(slow_path_unlock);
2432 
2433     // If we haven't already saved the native result we must save it now as xmm registers
2434     // are still exposed.
2435     __ vzeroupper();
2436     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2437       save_native_result(masm, ret_type, stack_slots);
2438     }
2439 
2440     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2441 
2442     __ mov(c_rarg0, obj_reg);
2443     __ mov(c_rarg2, r15_thread);
2444     __ mov(r12, rsp); // remember sp
2445     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2446     __ andptr(rsp, -16); // align stack as required by ABI
2447 
2448     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2449     // NOTE that obj_reg == rbx currently
2450     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2451     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2452 
2453     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2454     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2455     __ mov(rsp, r12); // restore sp
2456     __ reinit_heapbase();
2457 #ifdef ASSERT
2458     {
2459       Label L;
2460       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2461       __ jcc(Assembler::equal, L);
2462       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2463       __ bind(L);
2464     }
2465 #endif /* ASSERT */
2466 
2467     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2468 
2469     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2470       restore_native_result(masm, ret_type, stack_slots);
2471     }
2472     __ jmp(unlock_done);
2473 
2474     // END Slow path unlock
2475 
2476   } // synchronized
2477 
2478   // SLOW PATH Reguard the stack if needed
2479 
2480   __ bind(reguard);
2481   __ vzeroupper();
2482   save_native_result(masm, ret_type, stack_slots);
2483   __ mov(r12, rsp); // remember sp
2484   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2485   __ andptr(rsp, -16); // align stack as required by ABI
2486   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2487   __ mov(rsp, r12); // restore sp
2488   __ reinit_heapbase();
2489   restore_native_result(masm, ret_type, stack_slots);
2490   // and continue
2491   __ jmp(reguard_done);
2492 
2493 
2494 
2495   __ flush();
2496 
2497   nmethod *nm = nmethod::new_native_nmethod(method,
2498                                             compile_id,
2499                                             masm->code(),
2500                                             vep_offset,
2501                                             frame_complete,
2502                                             stack_slots / VMRegImpl::slots_per_word,
2503                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2504                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2505                                             oop_maps);
2506 
2507   return nm;
2508 }
2509 
2510 // This function returns the adjustment (in number of words) to a c2i adapter
2511 // activation for use during deoptimization.
2512 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2513   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2514 }
2515 
2516 
2517 uint SharedRuntime::out_preserve_stack_slots() {
2518   return 0;
2519 }
2520 
2521 
2522 // Number of stack slots between incoming argument block and the start of
2523 // a new frame.  The PROLOG must add this many slots to the stack.  The
2524 // EPILOG must remove this many slots.  amd64 needs two slots for
2525 // return address.
2526 uint SharedRuntime::in_preserve_stack_slots() {
2527   return 4 + 2 * VerifyStackAtCalls;
2528 }
2529 
2530 //------------------------------generate_deopt_blob----------------------------
2531 void SharedRuntime::generate_deopt_blob() {
2532   // Allocate space for the code
2533   ResourceMark rm;
2534   // Setup code generation tools
2535   int pad = 0;
2536   if (UseAVX > 2) {
2537     pad += 1024;
2538   }
2539 #if INCLUDE_JVMCI
2540   if (EnableJVMCI) {
2541     pad += 512; // Increase the buffer size when compiling for JVMCI
2542   }
2543 #endif
2544   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2545   MacroAssembler* masm = new MacroAssembler(&buffer);
2546   int frame_size_in_words;
2547   OopMap* map = nullptr;
2548   OopMapSet *oop_maps = new OopMapSet();
2549 
2550   // -------------
2551   // This code enters when returning to a de-optimized nmethod.  A return
2552   // address has been pushed on the stack, and return values are in
2553   // registers.
2554   // If we are doing a normal deopt then we were called from the patched
2555   // nmethod from the point we returned to the nmethod. So the return
2556   // address on the stack is wrong by NativeCall::instruction_size
2557   // We will adjust the value so it looks like we have the original return
2558   // address on the stack (like when we eagerly deoptimized).
2559   // In the case of an exception pending when deoptimizing, we enter
2560   // with a return address on the stack that points after the call we patched
2561   // into the exception handler. We have the following register state from,
2562   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2563   //    rax: exception oop
2564   //    rbx: exception handler
2565   //    rdx: throwing pc
2566   // So in this case we simply jam rdx into the useless return address and
2567   // the stack looks just like we want.
2568   //
2569   // At this point we need to de-opt.  We save the argument return
2570   // registers.  We call the first C routine, fetch_unroll_info().  This
2571   // routine captures the return values and returns a structure which
2572   // describes the current frame size and the sizes of all replacement frames.
2573   // The current frame is compiled code and may contain many inlined
2574   // functions, each with their own JVM state.  We pop the current frame, then
2575   // push all the new frames.  Then we call the C routine unpack_frames() to
2576   // populate these frames.  Finally unpack_frames() returns us the new target
2577   // address.  Notice that callee-save registers are BLOWN here; they have
2578   // already been captured in the vframeArray at the time the return PC was
2579   // patched.
2580   address start = __ pc();
2581   Label cont;
2582 
2583   // Prolog for non exception case!
2584 
2585   // Save everything in sight.
2586   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2587 
2588   // Normal deoptimization.  Save exec mode for unpack_frames.
2589   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2590   __ jmp(cont);
2591 
2592   int reexecute_offset = __ pc() - start;
2593 #if INCLUDE_JVMCI && !defined(COMPILER1)
2594   if (EnableJVMCI && UseJVMCICompiler) {
2595     // JVMCI does not use this kind of deoptimization
2596     __ should_not_reach_here();
2597   }
2598 #endif
2599 
2600   // Reexecute case
2601   // return address is the pc that describes what bci to re-execute at
2602 
2603   // No need to update map as each call to save_live_registers will produce identical oopmap
2604   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2605 
2606   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2607   __ jmp(cont);
2608 
2609 #if INCLUDE_JVMCI
2610   Label after_fetch_unroll_info_call;
2611   int implicit_exception_uncommon_trap_offset = 0;
2612   int uncommon_trap_offset = 0;
2613 
2614   if (EnableJVMCI) {
2615     implicit_exception_uncommon_trap_offset = __ pc() - start;
2616 
2617     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2618     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2619 
2620     uncommon_trap_offset = __ pc() - start;
2621 
2622     // Save everything in sight.
2623     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2624     // fetch_unroll_info needs to call last_java_frame()
2625     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2626 
2627     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2628     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2629 
2630     __ movl(r14, Deoptimization::Unpack_reexecute);
2631     __ mov(c_rarg0, r15_thread);
2632     __ movl(c_rarg2, r14); // exec mode
2633     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2634     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2635 
2636     __ reset_last_Java_frame(false);
2637 
2638     __ jmp(after_fetch_unroll_info_call);
2639   } // EnableJVMCI
2640 #endif // INCLUDE_JVMCI
2641 
2642   int exception_offset = __ pc() - start;
2643 
2644   // Prolog for exception case
2645 
2646   // all registers are dead at this entry point, except for rax, and
2647   // rdx which contain the exception oop and exception pc
2648   // respectively.  Set them in TLS and fall thru to the
2649   // unpack_with_exception_in_tls entry point.
2650 
2651   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2652   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2653 
2654   int exception_in_tls_offset = __ pc() - start;
2655 
2656   // new implementation because exception oop is now passed in JavaThread
2657 
2658   // Prolog for exception case
2659   // All registers must be preserved because they might be used by LinearScan
2660   // Exception oop and throwing PC are passed in JavaThread
2661   // tos: stack at point of call to method that threw the exception (i.e. only
2662   // args are on the stack, no return address)
2663 
2664   // make room on stack for the return address
2665   // It will be patched later with the throwing pc. The correct value is not
2666   // available now because loading it from memory would destroy registers.
2667   __ push(0);
2668 
2669   // Save everything in sight.
2670   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2671 
2672   // Now it is safe to overwrite any register
2673 
2674   // Deopt during an exception.  Save exec mode for unpack_frames.
2675   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2676 
2677   // load throwing pc from JavaThread and patch it as the return address
2678   // of the current frame. Then clear the field in JavaThread
2679 
2680   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2681   __ movptr(Address(rbp, wordSize), rdx);
2682   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2683 
2684 #ifdef ASSERT
2685   // verify that there is really an exception oop in JavaThread
2686   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2687   __ verify_oop(rax);
2688 
2689   // verify that there is no pending exception
2690   Label no_pending_exception;
2691   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2692   __ testptr(rax, rax);
2693   __ jcc(Assembler::zero, no_pending_exception);
2694   __ stop("must not have pending exception here");
2695   __ bind(no_pending_exception);
2696 #endif
2697 
2698   __ bind(cont);
2699 
2700   // Call C code.  Need thread and this frame, but NOT official VM entry
2701   // crud.  We cannot block on this call, no GC can happen.
2702   //
2703   // UnrollBlock* fetch_unroll_info(JavaThread* thread, int exec_mode)
2704 
2705   // fetch_unroll_info needs to call last_java_frame().
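  //
  // The returned UnrollBlock* (in rax, moved to rdi below) carries everything
  // the unpack loop needs: the unpack kind, the size of the deoptimized frame,
  // the caller adjustment, and the arrays of skeletal frame sizes and pcs.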
2706 
2707   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2708 #ifdef ASSERT
2709   { Label L;
2710     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2711     __ jcc(Assembler::equal, L);
2712     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2713     __ bind(L);
2714   }
2715 #endif // ASSERT
2716   __ mov(c_rarg0, r15_thread);
2717   __ movl(c_rarg1, r14); // exec_mode
2718   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2719 
2720   // Need to have an oopmap that tells fetch_unroll_info where to
2721   // find any register it might need.
2722   oop_maps->add_gc_map(__ pc() - start, map);
2723 
2724   __ reset_last_Java_frame(false);
2725 
2726 #if INCLUDE_JVMCI
2727   if (EnableJVMCI) {
2728     __ bind(after_fetch_unroll_info_call);
2729   }
2730 #endif
2731 
2732   // Load UnrollBlock* into rdi
2733   __ mov(rdi, rax);
2734 
2735   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2736   Label noException;
2737   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2738   __ jcc(Assembler::notEqual, noException);
2739   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2740   // QQQ this is useless; it was null above
2741   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2742   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2743   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2744 
2745   __ verify_oop(rax);
2746 
2747   // Overwrite the result registers with the exception results.
2748   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2749   // I think this is useless
2750   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2751 
2752   __ bind(noException);
2753 
2754   // Only register save data is on the stack.
2755   // Now restore the result registers.  Everything else is either dead
2756   // or captured in the vframeArray.
2757   RegisterSaver::restore_result_registers(masm);
2758 
2759   // All of the register save area has been popped off the stack. Only the
2760   // return address remains.
2761 
2762   // Pop all the frames we must move/replace.
2763   //
2764   // Frame picture (youngest to oldest)
2765   // 1: self-frame (no frame link)
2766   // 2: deopting frame  (no frame link)
2767   // 3: caller of deopting frame (could be compiled/interpreted).
2768   //
2769   // Note: by leaving the return address of self-frame on the stack
2770   // and using the size of frame 2 to adjust the stack
2771   // when we are done the return to frame 3 will still be on the stack.
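  //
  // In outline: pop the deoptimized frame (2), extend the caller's frame (3)
  // by caller_adjustment to make room for the extra interpreter locals, push
  // one skeletal interpreter frame per entry in the frame_sizes/frame_pcs
  // arrays, and finally re-push a self-frame for the unpack_frames call.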
2772 
2773   // Pop deoptimized frame
2774   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2775   __ addptr(rsp, rcx);
2776 
2777   // rsp should be pointing at the return address to the caller (3)
2778 
2779   // Pick up the initial fp we should save
2780   // Restore rbp before the stack bang because if a stack overflow is thrown it needs to be pushed (and preserved).
2781   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2782 
2783 #ifdef ASSERT
2784   // Compilers generate code that bangs the stack by as much as the
2785   // interpreter would need. So this stack banging should never
2786   // trigger a fault. Verify that it does not on non-product builds.
2787   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2788   __ bang_stack_size(rbx, rcx);
2789 #endif
2790 
2791   // Load address of array of frame pcs into rcx
2792   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2793 
2794   // Trash the old pc
2795   __ addptr(rsp, wordSize);
2796 
2797   // Load address of array of frame sizes into rsi
2798   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2799 
2800   // Load counter into rdx
2801   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2802 
2803   // Now adjust the caller's stack to make up for the extra locals
2804   // but record the original sp so that we can save it in the skeletal interpreter
2805   // frame and the stack walking of interpreter_sender will get the unextended sp
2806   // value and not the "real" sp value.
2807 
2808   const Register sender_sp = r8;
2809 
2810   __ mov(sender_sp, rsp);
2811   __ movl(rbx, Address(rdi,
2812                        Deoptimization::UnrollBlock::
2813                        caller_adjustment_offset()));
2814   __ subptr(rsp, rbx);
2815 
2816   // Push interpreter frames in a loop
2817   Label loop;
2818   __ bind(loop);
2819   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2820   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2821   __ pushptr(Address(rcx, 0));          // Save return address
2822   __ enter();                           // Save old & set new ebp
2823   __ subptr(rsp, rbx);                  // Prolog
2824   // This value is corrected by layout_activation_impl
2825   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2826   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2827   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2828   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2829   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2830   __ decrementl(rdx);                   // Decrement counter
2831   __ jcc(Assembler::notZero, loop);
2832   __ pushptr(Address(rcx, 0));          // Save final return address
2833 
2834   // Re-push self-frame
2835   __ enter();                           // Save old & set new ebp
2836 
2837   // Allocate a full sized register save area.
2838   // Return address and rbp are in place, so we allocate two fewer words.
2839   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2840 
2841   // Restore frame locals after moving the frame
2842   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2843   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2844 
2845   // Call C code.  Need thread but NOT official VM entry
2846   // crud.  We cannot block on this call, no GC can happen.  Call should
2847   // restore return values to their stack-slots with the new SP.
2848   //
2849   // BasicType Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2850 
2851   // Use rbp because the frames look interpreted now
2852   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2853   // Don't need the precise return PC here, just precise enough to point into this code blob.
2854   address the_pc = __ pc();
2855   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2856 
2857   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2858   __ mov(c_rarg0, r15_thread);
2859   __ movl(c_rarg1, r14); // second arg: exec_mode
2860   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2861   // Revert SP alignment after call since we're going to do some SP relative addressing below
2862   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2863 
2864   // Set an oopmap for the call site
2865   // Use the same PC we used for the last java frame
2866   oop_maps->add_gc_map(the_pc - start,
2867                        new OopMap( frame_size_in_words, 0 ));
2868 
2869   // Clear fp AND pc
2870   __ reset_last_Java_frame(true);
2871 
2872   // Collect return values
2873   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2874   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2875   // I think this is useless (throwing pc?)
2876   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2877 
2878   // Pop self-frame.
2879   __ leave();                           // Epilog
2880 
2881   // Jump to interpreter
2882   __ ret(0);
2883 
2884   // Make sure all code is generated
2885   masm->flush();
2886 
2887   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2888   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2889 #if INCLUDE_JVMCI
2890   if (EnableJVMCI) {
2891     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2892     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2893   }
2894 #endif
2895 }
2896 
2897 #ifdef COMPILER2
2898 //------------------------------generate_uncommon_trap_blob--------------------
2899 void SharedRuntime::generate_uncommon_trap_blob() {
2900   // Allocate space for the code
2901   ResourceMark rm;
2902   // Setup code generation tools
2903   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2904   MacroAssembler* masm = new MacroAssembler(&buffer);
2905 
2906   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2907 
2908   address start = __ pc();
2909 
2910   if (UseRTMLocking) {
2911     // Abort RTM transaction before possible nmethod deoptimization.
2912     __ xabort(0);
2913   }
2914 
2915   // Push self-frame.  We get here with a return address on the
2916   // stack, so rsp is 8-byte aligned until we allocate our frame.
2917   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
2918 
2919   // No callee saved registers. rbp is assumed implicitly saved
2920   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2921 
2922   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2923   // runtime expects it.
2924   __ movl(c_rarg1, j_rarg0);
2925 
2926   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2927 
2928   // Call C code.  Need thread but NOT official VM entry
2929   // crud.  We cannot block on this call, no GC can happen.  Call should
2930   // capture callee-saved registers as well as return values.
2931   // Thread is in rdi already.
2932   //
2933   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
2934 
2935   __ mov(c_rarg0, r15_thread);
2936   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2937   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2938 
2939   // Set an oopmap for the call site
2940   OopMapSet* oop_maps = new OopMapSet();
2941   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2942 
2943   // location of rbp is known implicitly by the frame sender code
2944 
2945   oop_maps->add_gc_map(__ pc() - start, map);
2946 
2947   __ reset_last_Java_frame(false);
2948 
2949   // Load UnrollBlock* into rdi
2950   __ mov(rdi, rax);
2951 
2952 #ifdef ASSERT
2953   { Label L;
2954     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
2955               Deoptimization::Unpack_uncommon_trap);
2956     __ jcc(Assembler::equal, L);
2957     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2958     __ bind(L);
2959   }
2960 #endif
2961 
2962   // Pop all the frames we must move/replace.
2963   //
2964   // Frame picture (youngest to oldest)
2965   // 1: self-frame (no frame link)
2966   // 2: deopting frame  (no frame link)
2967   // 3: caller of deopting frame (could be compiled/interpreted).
2968 
2969   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2970   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2971 
2972   // Pop deoptimized frame (int)
2973   __ movl(rcx, Address(rdi,
2974                        Deoptimization::UnrollBlock::
2975                        size_of_deoptimized_frame_offset()));
2976   __ addptr(rsp, rcx);
2977 
2978   // rsp should be pointing at the return address to the caller (3)
2979 
2980   // Pick up the initial fp we should save
2981   // Restore rbp before the stack bang because if a stack overflow is thrown it needs to be pushed (and preserved).
2982   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2983 
2984 #ifdef ASSERT
2985   // Compilers generate code that bangs the stack by as much as the
2986   // interpreter would need. So this stack banging should never
2987   // trigger a fault. Verify that it does not on non-product builds.
2988   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2989   __ bang_stack_size(rbx, rcx);
2990 #endif
2991 
2992   // Load address of array of frame pcs into rcx (address*)
2993   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2994 
2995   // Trash the return pc
2996   __ addptr(rsp, wordSize);
2997 
2998   // Load address of array of frame sizes into rsi (intptr_t*)
2999   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3000 
3001   // Counter
3002   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
3003 
3004   // Now adjust the caller's stack to make up for the extra locals but
3005   // record the original sp so that we can save it in the skeletal
3006   // interpreter frame and the stack walking of interpreter_sender
3007   // will get the unextended sp value and not the "real" sp value.
3008 
3009   const Register sender_sp = r8;
3010 
3011   __ mov(sender_sp, rsp);
3012   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
3013   __ subptr(rsp, rbx);
3014 
3015   // Push interpreter frames in a loop
3016   Label loop;
3017   __ bind(loop);
3018   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3019   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3020   __ pushptr(Address(rcx, 0));     // Save return address
3021   __ enter();                      // Save old & set new rbp
3022   __ subptr(rsp, rbx);             // Prolog
3023   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3024             sender_sp);            // Make it walkable
3025   // This value is corrected by layout_activation_impl
3026   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3027   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3028   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3029   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3030   __ decrementl(rdx);              // Decrement counter
3031   __ jcc(Assembler::notZero, loop);
3032   __ pushptr(Address(rcx, 0));     // Save final return address
3033 
3034   // Re-push self-frame
3035   __ enter();                 // Save old & set new rbp
3036   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt); // Prolog
3038 
3039   // Use rbp because the frames look interpreted now
3040   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3041   // Don't need the precise return PC here, just precise enough to point into this code blob.
3042   address the_pc = __ pc();
3043   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3044 
3045   // Call C code.  Need thread but NOT official VM entry
3046   // crud.  We cannot block on this call, no GC can happen.  Call should
3047   // restore return values to their stack-slots with the new SP.
3048   // Thread is in rdi already.
3049   //
3050   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3051 
3052   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3053   __ mov(c_rarg0, r15_thread);
3054   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3055   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3056 
3057   // Set an oopmap for the call site
3058   // Use the same PC we used for the last java frame
3059   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3060 
3061   // Clear fp AND pc
3062   __ reset_last_Java_frame(true);
3063 
3064   // Pop self-frame.
3065   __ leave();                 // Epilog
3066 
3067   // Jump to interpreter
3068   __ ret(0);
3069 
3070   // Make sure all code is generated
3071   masm->flush();
3072 
3073   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3074                                                  SimpleRuntimeFrame::framesize >> 1);
3075 }
3076 #endif // COMPILER2
3077 
3078 //------------------------------generate_handler_blob------
3079 //
3080 // Generate a special Compile2Runtime blob that saves all registers,
3081 // and setup oopmap.
3082 //
3083 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3084   assert(StubRoutines::forward_exception_entry() != nullptr,
3085          "must be generated before");
3086 
3087   ResourceMark rm;
3088   OopMapSet *oop_maps = new OopMapSet();
3089   OopMap* map;
3090 
3091   // Allocate space for the code.  Setup code generation tools.
3092   CodeBuffer buffer("handler_blob", 2048, 1024);
3093   MacroAssembler* masm = new MacroAssembler(&buffer);
3094 
3095   address start   = __ pc();
3096   address call_pc = nullptr;
3097   int frame_size_in_words;
3098   bool cause_return = (poll_type == POLL_AT_RETURN);
3099   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3100 
3101   if (UseRTMLocking) {
3102     // Abort RTM transaction before calling runtime
3103     // because critical section will be large and will be
3104     // aborted anyway. Also nmethod could be deoptimized.
3105     __ xabort(0);
3106   }
3107 
3108   // Make room for return address (or push it again)
3109   if (!cause_return) {
3110     __ push(rbx);
3111   }
3112 
3113   // Save registers, fpu state, and flags
3114   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3115 
3116   // The following is basically a call_VM.  However, we need the precise
3117   // address of the call in order to generate an oopmap. Hence, we do all the
3118   // work ourselves.
3119 
3120   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3121 
3122   // The return address must always be correct so that frame constructor never
3123   // sees an invalid pc.
3124 
3125   if (!cause_return) {
3126     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3127     // Additionally, rbx is a callee saved register and we can look at it later to determine
3128     // if someone changed the return address for us!
3129     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3130     __ movptr(Address(rbp, wordSize), rbx);
3131   }
3132 
3133   // Do the call
3134   __ mov(c_rarg0, r15_thread);
3135   __ call(RuntimeAddress(call_ptr));
3136 
3137   // Set an oopmap for the call site.  This oopmap will map all
3138   // oop-registers and debug-info registers as callee-saved.  This
3139   // will allow deoptimization at this safepoint to find all possible
3140   // debug-info recordings, as well as let GC find all oops.
3141 
3142   oop_maps->add_gc_map( __ pc() - start, map);
3143 
3144   Label noException;
3145 
3146   __ reset_last_Java_frame(false);
3147 
3148   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3149   __ jcc(Assembler::equal, noException);
3150 
3151   // Exception pending
3152 
3153   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3154 
3155   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3156 
3157   // No exception case
3158   __ bind(noException);
3159 
3160   Label no_adjust;
3161 #ifdef ASSERT
3162   Label bail;
3163 #endif
3164   if (!cause_return) {
3165     Label no_prefix, not_special;
3166 
3167     // If our stashed return pc was modified by the runtime we avoid touching it
3168     __ cmpptr(rbx, Address(rbp, wordSize));
3169     __ jccb(Assembler::notEqual, no_adjust);
3170 
3171     // Skip over the poll instruction.
3172     // See NativeInstruction::is_safepoint_poll()
3173     // Possible encodings:
3174     //      85 00       test   %eax,(%rax)
3175     //      85 01       test   %eax,(%rcx)
3176     //      85 02       test   %eax,(%rdx)
3177     //      85 03       test   %eax,(%rbx)
3178     //      85 06       test   %eax,(%rsi)
3179     //      85 07       test   %eax,(%rdi)
3180     //
3181     //   41 85 00       test   %eax,(%r8)
3182     //   41 85 01       test   %eax,(%r9)
3183     //   41 85 02       test   %eax,(%r10)
3184     //   41 85 03       test   %eax,(%r11)
3185     //   41 85 06       test   %eax,(%r14)
3186     //   41 85 07       test   %eax,(%r15)
3187     //
3188     //      85 04 24    test   %eax,(%rsp)
3189     //   41 85 04 24    test   %eax,(%r12)
3190     //      85 45 00    test   %eax,0x0(%rbp)
3191     //   41 85 45 00    test   %eax,0x0(%r13)
3192 
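    // The skip logic below is therefore: consume an optional REX.B prefix
    // (0x41), then expect the 0x85 opcode, and account for one extra byte when
    // the base register is rsp/r12 (SIB byte) or rbp/r13 (disp8), which is
    // what the "modrm & 7 in {4, 5}" test checks; the final addptr(rbx, 2)
    // steps over the opcode and modrm bytes themselves.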
3193     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3194     __ jcc(Assembler::notEqual, no_prefix);
3195     __ addptr(rbx, 1);
3196     __ bind(no_prefix);
3197 #ifdef ASSERT
3198     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3199 #endif
3200     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3201     // r12/rsp 0x04
3202     // r13/rbp 0x05
3203     __ movzbq(rcx, Address(rbx, 1));
3204     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3205     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3206     __ cmpptr(rcx, 1);
3207     __ jcc(Assembler::above, not_special);
3208     __ addptr(rbx, 1);
3209     __ bind(not_special);
3210 #ifdef ASSERT
3211     // Verify the correct encoding of the poll we're about to skip.
3212     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3213     __ jcc(Assembler::notEqual, bail);
3214     // Mask out the modrm bits
3215     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3216     // rax encodes to 0, so if the bits are nonzero it's incorrect
3217     __ jcc(Assembler::notZero, bail);
3218 #endif
3219     // Adjust return pc forward to step over the safepoint poll instruction
3220     __ addptr(rbx, 2);
3221     __ movptr(Address(rbp, wordSize), rbx);
3222   }
3223 
3224   __ bind(no_adjust);
3225   // Normal exit, restore registers and exit.
3226   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3227   __ ret(0);
3228 
3229 #ifdef ASSERT
3230   __ bind(bail);
3231   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3232 #endif
3233 
3234   // Make sure all code is generated
3235   masm->flush();
3236 
3237   // Fill-out other meta info
3238   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3239 }
3240 
3241 //
3242 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3243 //
3244 // Generate a stub that calls into the VM to find out the proper destination
3245 // of a Java call. All the argument registers are live at this point,
3246 // but since this is generic code we don't know what they are and the caller
3247 // must do any GC of the args.
3248 //
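// The stub saves all registers, calls `destination` with the current thread,
// picks the resolved Method* out of vm_result_2 and the target entry point out
// of rax, restores the registers, and jumps to that entry point; if the call
// left a pending exception it forwards to forward_exception_entry().
//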
3249 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3250   assert(StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3251 
3252   // allocate space for the code
3253   ResourceMark rm;
3254 
3255   CodeBuffer buffer(name, 1200, 512);
3256   MacroAssembler* masm = new MacroAssembler(&buffer);
3257 
3258   int frame_size_in_words;
3259 
3260   OopMapSet *oop_maps = new OopMapSet();
3261   OopMap* map = nullptr;
3262 
3263   int start = __ offset();
3264 
3265   // No need to save vector registers since they are caller-saved anyway.
3266   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3267 
3268   int frame_complete = __ offset();
3269 
3270   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3271 
3272   __ mov(c_rarg0, r15_thread);
3273 
3274   __ call(RuntimeAddress(destination));
3275 
3277   // Set an oopmap for the call site.
3278   // We need this not only for callee-saved registers, but also for volatile
3279   // registers that the compiler might be keeping live across a safepoint.
3280 
3281   oop_maps->add_gc_map( __ offset() - start, map);
3282 
3283   // rax contains the address we are going to jump to, assuming no exception was installed.
3284 
3285   // clear last_Java_sp
3286   __ reset_last_Java_frame(false);
3287   // check for pending exceptions
3288   Label pending;
3289   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3290   __ jcc(Assembler::notEqual, pending);
3291 
3292   // get the returned Method*
3293   __ get_vm_result_2(rbx, r15_thread);
3294   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3295 
3296   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3297 
3298   RegisterSaver::restore_live_registers(masm);
3299 
3300   // We are back to the original state on entry and ready to go.
3301 
3302   __ jmp(rax);
3303 
3304   // Pending exception after the safepoint
3305 
3306   __ bind(pending);
3307 
3308   RegisterSaver::restore_live_registers(masm);
3309 
3310   // exception pending => remove activation and forward to exception handler
3311 
3312   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3313 
3314   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3315   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3316 
3317   // -------------
3318   // make sure all code is generated
3319   masm->flush();
3320 
3321   // return the blob
3322   // frame size is in words
3323   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3324 }
3325 
3326 //------------------------------Montgomery multiplication------------------------
3327 //
3328 
3329 #ifndef _WINDOWS
3330 
3331 // Subtract 0:b from carry:a.  Return carry.
3332 static julong
3333 sub(julong a[], julong b[], julong carry, long len) {
3334   long long i = 0, cnt = len;
3335   julong tmp;
3336   asm volatile("clc; "
3337                "0: ; "
3338                "mov (%[b], %[i], 8), %[tmp]; "
3339                "sbb %[tmp], (%[a], %[i], 8); "
3340                "inc %[i]; dec %[cnt]; "
3341                "jne 0b; "
3342                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3343                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3344                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3345                : "memory");
3346   return tmp;
3347 }
3348 
3349 // Multiply (unsigned) Long A by Long B, accumulating the double-
3350 // length result into the accumulator formed of T0, T1, and T2.
3351 #define MACC(A, B, T0, T1, T2)                                  \
3352 do {                                                            \
3353   unsigned long hi, lo;                                         \
3354   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3355            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3356            : "r"(A), "a"(B) : "cc");                            \
3357  } while(0)
3358 
3359 // As above, but add twice the double-length result into the
3360 // accumulator.
3361 #define MACC2(A, B, T0, T1, T2)                                 \
3362 do {                                                            \
3363   unsigned long hi, lo;                                         \
3364   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3365            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3366            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3367            : "r"(A), "a"(B) : "cc");                            \
3368  } while(0)
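
// Note: in both macros the accumulator (T2:T1:T0) is a 192-bit value held in
// three 64-bit limbs, least significant first.  MACC adds the 128-bit product
// rdx:rax of A*B into it with carry propagation into T2, and MACC2 adds that
// product twice.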
3369 
3370 #else //_WINDOWS
3371 
3372 static julong
3373 sub(julong a[], julong b[], julong carry, long len) {
3374   long i;
3375   julong tmp;
3376   unsigned char c = 1;
3377   for (i = 0; i < len; i++) {
3378     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3379     a[i] = tmp;
3380   }
3381   c = _addcarry_u64(c, carry, ~0, &tmp);
3382   return tmp;
3383 }
3384 
3385 // Multiply (unsigned) Long A by Long B, accumulating the double-
3386 // length result into the accumulator formed of T0, T1, and T2.
3387 #define MACC(A, B, T0, T1, T2)                          \
3388 do {                                                    \
3389   julong hi, lo;                            \
3390   lo = _umul128(A, B, &hi);                             \
3391   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3392   c = _addcarry_u64(c, hi, T1, &T1);                    \
3393   _addcarry_u64(c, T2, 0, &T2);                         \
3394  } while(0)
3395 
3396 // As above, but add twice the double-length result into the
3397 // accumulator.
3398 #define MACC2(A, B, T0, T1, T2)                         \
3399 do {                                                    \
3400   julong hi, lo;                            \
3401   lo = _umul128(A, B, &hi);                             \
3402   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3403   c = _addcarry_u64(c, hi, T1, &T1);                    \
3404   _addcarry_u64(c, T2, 0, &T2);                         \
3405   c = _addcarry_u64(0, lo, T0, &T0);                    \
3406   c = _addcarry_u64(c, hi, T1, &T1);                    \
3407   _addcarry_u64(c, T2, 0, &T2);                         \
3408  } while(0)
3409 
3410 #endif //_WINDOWS
3411 
3412 // Fast Montgomery multiplication.  The derivation of the algorithm is
3413 // in  A Cryptographic Library for the Motorola DSP56000,
3414 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
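//
// The key step in the loops below is the choice of m[i]: the assert checks
// that inv * n[0] == -1 (mod 2^64), so setting m[i] = t0 * inv makes
// t0 + m[i]*n[0] == 0 (mod 2^64).  The low limb of the accumulator therefore
// cancels at every step, which is what the assert(t0 == 0) verifies.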
3415 
3416 static void NOINLINE
3417 montgomery_multiply(julong a[], julong b[], julong n[],
3418                     julong m[], julong inv, int len) {
3419   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3420   int i;
3421 
3422   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3423 
3424   for (i = 0; i < len; i++) {
3425     int j;
3426     for (j = 0; j < i; j++) {
3427       MACC(a[j], b[i-j], t0, t1, t2);
3428       MACC(m[j], n[i-j], t0, t1, t2);
3429     }
3430     MACC(a[i], b[0], t0, t1, t2);
3431     m[i] = t0 * inv;
3432     MACC(m[i], n[0], t0, t1, t2);
3433 
3434     assert(t0 == 0, "broken Montgomery multiply");
3435 
3436     t0 = t1; t1 = t2; t2 = 0;
3437   }
3438 
3439   for (i = len; i < 2*len; i++) {
3440     int j;
3441     for (j = i-len+1; j < len; j++) {
3442       MACC(a[j], b[i-j], t0, t1, t2);
3443       MACC(m[j], n[i-j], t0, t1, t2);
3444     }
3445     m[i-len] = t0;
3446     t0 = t1; t1 = t2; t2 = 0;
3447   }
3448 
3449   while (t0)
3450     t0 = sub(m, n, t0, len);
3451 }
3452 
3453 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3454 // multiplies so it should be up to 25% faster than Montgomery
3455 // multiplication.  However, its loop control is more complex and it
3456 // may actually run slower on some machines.
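//
// The saving comes from symmetry: for j != i-j the products a[j]*a[i-j] and
// a[i-j]*a[j] are equal, so each off-diagonal pair is computed once and added
// twice via MACC2, with a single MACC for the diagonal term a[j]*a[j] when i
// is even.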
3457 
3458 static void NOINLINE
3459 montgomery_square(julong a[], julong n[],
3460                   julong m[], julong inv, int len) {
3461   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3462   int i;
3463 
3464   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3465 
3466   for (i = 0; i < len; i++) {
3467     int j;
3468     int end = (i+1)/2;
3469     for (j = 0; j < end; j++) {
3470       MACC2(a[j], a[i-j], t0, t1, t2);
3471       MACC(m[j], n[i-j], t0, t1, t2);
3472     }
3473     if ((i & 1) == 0) {
3474       MACC(a[j], a[j], t0, t1, t2);
3475     }
3476     for (; j < i; j++) {
3477       MACC(m[j], n[i-j], t0, t1, t2);
3478     }
3479     m[i] = t0 * inv;
3480     MACC(m[i], n[0], t0, t1, t2);
3481 
3482     assert(t0 == 0, "broken Montgomery square");
3483 
3484     t0 = t1; t1 = t2; t2 = 0;
3485   }
3486 
3487   for (i = len; i < 2*len; i++) {
3488     int start = i-len+1;
3489     int end = start + (len - start)/2;
3490     int j;
3491     for (j = start; j < end; j++) {
3492       MACC2(a[j], a[i-j], t0, t1, t2);
3493       MACC(m[j], n[i-j], t0, t1, t2);
3494     }
3495     if ((i & 1) == 0) {
3496       MACC(a[j], a[j], t0, t1, t2);
3497     }
3498     for (; j < len; j++) {
3499       MACC(m[j], n[i-j], t0, t1, t2);
3500     }
3501     m[i-len] = t0;
3502     t0 = t1; t1 = t2; t2 = 0;
3503   }
3504 
3505   while (t0)
3506     t0 = sub(m, n, t0, len);
3507 }
3508 
3509 // Swap words in a longword.
3510 static julong swap(julong x) {
3511   return (x << 32) | (x >> 32);
3512 }
3513 
3514 // Copy len longwords from s to d, word-swapping as we go.  The
3515 // destination array is reversed.
3516 static void reverse_words(julong *s, julong *d, int len) {
3517   d += len;
3518   while(len-- > 0) {
3519     d--;
3520     *d = swap(*s);
3521     s++;
3522   }
3523 }
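
// Note: the jint arrays passed to the SharedRuntime entry points below are
// assumed to be most-significant-word first (Java BigInteger magnitude order);
// reverse_words() plus the per-longword swap() converts them to the
// least-significant-limb-first julong layout the routines above work on, and
// converts the result back.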
3524 
3525 // The threshold at which squaring is advantageous was determined
3526 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3527 #define MONTGOMERY_SQUARING_THRESHOLD 64
3528 
3529 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3530                                         jint len, jlong inv,
3531                                         jint *m_ints) {
3532   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3533   int longwords = len/2;
3534 
3535   // Make very sure we don't use so much space that the stack might
3536   // overflow.  512 jints correspond to a 16384-bit integer and
3537   // will use a total of 8K bytes of stack space here.
3538   int divisor = sizeof(julong) * 4;
3539   guarantee(longwords <= 8192 / divisor, "must be");
3540   int total_allocation = longwords * sizeof (julong) * 4;
3541   julong *scratch = (julong *)alloca(total_allocation);
3542 
3543   // Local scratch arrays
3544   julong
3545     *a = scratch + 0 * longwords,
3546     *b = scratch + 1 * longwords,
3547     *n = scratch + 2 * longwords,
3548     *m = scratch + 3 * longwords;
3549 
3550   reverse_words((julong *)a_ints, a, longwords);
3551   reverse_words((julong *)b_ints, b, longwords);
3552   reverse_words((julong *)n_ints, n, longwords);
3553 
3554   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3555 
3556   reverse_words(m, (julong *)m_ints, longwords);
3557 }
3558 
3559 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3560                                       jint len, jlong inv,
3561                                       jint *m_ints) {
3562   assert(len % 2 == 0, "array length in montgomery_square must be even");
3563   int longwords = len/2;
3564 
3565   // Make very sure we don't use so much space that the stack might
3566   // overflow.  512 jints correspond to a 16384-bit integer and
3567   // will use a total of 6K bytes of stack space here.
3568   int divisor = sizeof(julong) * 3;
3569   guarantee(longwords <= (8192 / divisor), "must be");
3570   int total_allocation = longwords * sizeof (julong) * 3;
3571   julong *scratch = (julong *)alloca(total_allocation);
3572 
3573   // Local scratch arrays
3574   julong
3575     *a = scratch + 0 * longwords,
3576     *n = scratch + 1 * longwords,
3577     *m = scratch + 2 * longwords;
3578 
3579   reverse_words((julong *)a_ints, a, longwords);
3580   reverse_words((julong *)n_ints, n, longwords);
3581 
3582   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3583     ::montgomery_square(a, n, m, (julong)inv, longwords);
3584   } else {
3585     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3586   }
3587 
3588   reverse_words(m, (julong *)m_ints, longwords);
3589 }
3590 
3591 #ifdef COMPILER2
3592 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3593 //
3594 //------------------------------generate_exception_blob---------------------------
3595 // Creates the exception blob at the end.
3596 // This code is jumped to from a compiled method via the exception blob
3597 // (see emit_exception_handler in the x86_64.ad file).
3598 //
3599 // Given an exception pc at a call we call into the runtime for the
3600 // handler in this method. This handler might merely restore state
3601 // (i.e. callee-saved registers), unwind the frame, and jump to the
3602 // exception handler of the caller if there is no Java-level handler
3603 // for the nmethod.
3604 //
3605 // This code is entered with a jmp.
3606 //
3607 // Arguments:
3608 //   rax: exception oop
3609 //   rdx: exception pc
3610 //
3611 // Results:
3612 //   rax: exception oop
3613 //   rdx: exception pc in caller or ???
3614 //   destination: exception handler of caller
3615 //
3616 // Note: the exception pc MUST be at a call (precise debug information)
3617 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3618 //
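// In outline: stash the exception oop and pc in the JavaThread, call
// OptoRuntime::handle_exception_C() to look up the handler, then reload the
// oop/pc from the thread and jump to the returned handler address (which may
// be the deopt blob).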
3619 
3620 void OptoRuntime::generate_exception_blob() {
3621   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3622   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3623   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3624 
3625   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3626 
3627   // Allocate space for the code
3628   ResourceMark rm;
3629   // Setup code generation tools
3630   CodeBuffer buffer("exception_blob", 2048, 1024);
3631   MacroAssembler* masm = new MacroAssembler(&buffer);
3632 
3633 
3634   address start = __ pc();
3635 
3636   // Exception pc is 'return address' for stack walker
3637   __ push(rdx);
3638   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3639 
3640   // Save callee-saved registers.  See x86_64.ad.
3641 
3642   // rbp is an implicitly saved callee saved register (i.e., the calling
3643   // convention will save/restore it in the prolog/epilog). Other than that
3644   // there are no callee save registers now that adapter frames are gone.
3645 
3646   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3647 
3648   // Store exception in Thread object. We cannot pass any arguments to the
3649   // handle_exception call, since we do not want to make any assumption
3650   // about the size of the frame where the exception happened in.
3651   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3652   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3653   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3654 
3655   // This call does all the hard work.  It checks if an exception handler
3656   // exists in the method.
3657   // If so, it returns the handler address.
3658   // If not, it prepares for stack-unwinding, restoring the callee-save
3659   // registers of the frame being removed.
3660   //
3661   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3662 
3663   // At a method handle call, the stack may not be properly aligned
3664   // when returning with an exception.
3665   address the_pc = __ pc();
3666   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3667   __ mov(c_rarg0, r15_thread);
3668   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3669   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3670 
3671   // Set an oopmap for the call site.  This oopmap will only be used if we
3672   // are unwinding the stack.  Hence, all locations will be dead.
3673   // Callee-saved registers will be the same as the frame above (i.e.,
3674   // handle_exception_stub), since they were restored when we got the
3675   // exception.
3676 
3677   OopMapSet* oop_maps = new OopMapSet();
3678 
3679   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3680 
3681   __ reset_last_Java_frame(false);
3682 
3683   // Restore callee-saved registers
3684 
3685   // rbp is an implicitly saved callee-saved register (i.e., the calling
3686   // convention will save/restore it in the prolog/epilog). Other than that
3687   // there are no callee-saved registers now that adapter frames are gone.
3688 
3689   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3690 
3691   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3692   __ pop(rdx);                  // No need for exception pc anymore
3693 
3694   // rax: exception handler
3695 
3696   // We have a handler in rax (could be deopt blob).
3697   __ mov(r8, rax);
3698 
3699   // Get the exception oop
3700   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3701   // Get the exception pc in case we are deoptimized
3702   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3703 #ifdef ASSERT
3704   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3705   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3706 #endif
3707   // Clear the exception oop so GC no longer processes it as a root.
3708   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3709 
3710   // rax: exception oop
3711   // r8:  exception handler
3712   // rdx: exception pc
3713   // Jump to handler
3714 
3715   __ jmp(r8);
3716 
3717   // Make sure all code is generated
3718   masm->flush();
3719 
3720   // Set exception blob
3721   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3722 }
3723 #endif // COMPILER2
3724