1 /*
   2  * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/icBuffer.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/compiledICHolder.hpp"
  46 #include "oops/klass.inline.hpp"
  47 #include "oops/method.inline.hpp"
  48 #include "prims/methodHandles.hpp"
  49 #include "runtime/continuation.hpp"
  50 #include "runtime/continuationEntry.inline.hpp"
  51 #include "runtime/globals.hpp"
  52 #include "runtime/jniHandles.hpp"
  53 #include "runtime/safepointMechanism.hpp"
  54 #include "runtime/sharedRuntime.hpp"
  55 #include "runtime/signature.hpp"
  56 #include "runtime/stubRoutines.hpp"
  57 #include "runtime/vframeArray.hpp"
  58 #include "runtime/vm_version.hpp"
  59 #include "utilities/align.hpp"
  60 #include "utilities/checkedCast.hpp"
  61 #include "utilities/formatBuffer.hpp"
  62 #include "vmreg_x86.inline.hpp"
  63 #ifdef COMPILER1
  64 #include "c1/c1_Runtime1.hpp"
  65 #endif
  66 #ifdef COMPILER2
  67 #include "opto/runtime.hpp"
  68 #endif
  69 #if INCLUDE_JVMCI
  70 #include "jvmci/jvmciJavaClasses.hpp"
  71 #endif
  72 
  73 #define __ masm->
  74 
  75 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  76 
  77 class SimpleRuntimeFrame {
  78 
  79   public:
  80 
  81   // Most of the runtime stubs have this simple frame layout.
  82   // This class exists to make the layout shared in one place.
  83   // Offsets are for compiler stack slots, which are jints.
  84   enum layout {
  85     // The frame sender code expects that rbp will be in the "natural" place and
  86     // will override any oopMap setting for it. We must therefore force the layout
  87     // so that it agrees with the frame sender code.
  88     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  89     rbp_off2,
  90     return_off, return_off2,
  91     framesize
  92   };
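  // For illustration (assuming a target where frame::arg_reg_save_area_bytes is 0,
  // e.g. the non-Windows x86_64 port): the enum above then yields rbp_off == 0,
  // return_off == 2 and framesize == 4 compiler slots, i.e. a two-word frame
  // holding just the saved rbp and the return address.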
  93 };
  94 
  95 class RegisterSaver {
  96   // Capture info about frame layout.  Layout offsets are in jint
  97   // units because compiler frame slots are jints.
  98 #define XSAVE_AREA_BEGIN 160
  99 #define XSAVE_AREA_YMM_BEGIN 576
 100 #define XSAVE_AREA_OPMASK_BEGIN 1088
 101 #define XSAVE_AREA_ZMM_BEGIN 1152
 102 #define XSAVE_AREA_UPPERBANK 1664
 103 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 104 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 105 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 106 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 107 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 108   enum layout {
 109     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 110     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 111     DEF_XMM_OFFS(0),
 112     DEF_XMM_OFFS(1),
 113     // 2..15 are implied in range usage
 114     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 115     DEF_YMM_OFFS(0),
 116     DEF_YMM_OFFS(1),
 117     // 2..15 are implied in range usage
 118     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 119     DEF_OPMASK_OFFS(0),
 120     DEF_OPMASK_OFFS(1),
 121     // 2..7 are implied in range usage
 122     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 123     DEF_ZMM_OFFS(0),
 124     DEF_ZMM_OFFS(1),
 125     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 126     DEF_ZMM_UPPER_OFFS(16),
 127     DEF_ZMM_UPPER_OFFS(17),
 128     // 18..31 are implied in range usage
 129     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 130     fpu_stateH_end,
 131     r15_off, r15H_off,
 132     r14_off, r14H_off,
 133     r13_off, r13H_off,
 134     r12_off, r12H_off,
 135     r11_off, r11H_off,
 136     r10_off, r10H_off,
 137     r9_off,  r9H_off,
 138     r8_off,  r8H_off,
 139     rdi_off, rdiH_off,
 140     rsi_off, rsiH_off,
 141     ignore_off, ignoreH_off,  // extra copy of rbp
 142     rsp_off, rspH_off,
 143     rbx_off, rbxH_off,
 144     rdx_off, rdxH_off,
 145     rcx_off, rcxH_off,
 146     rax_off, raxH_off,
 147     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 148     align_off, alignH_off,
 149     flags_off, flagsH_off,
 150     // The frame sender code expects that rbp will be in the "natural" place and
 151     // will override any oopMap setting for it. We must therefore force the layout
 152     // so that it agrees with the frame sender code.
 153     rbp_off, rbpH_off,        // copy of rbp we will restore
 154     return_off, returnH_off,  // slot for return address
 155     reg_save_size             // size in compiler stack slots
 156   };
 157 
 158  public:
 159   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 160   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 161 
 162   // Offsets into the register save area
 163   // Used by deoptimization when it is managing result register
 164   // values on its own
 165 
 166   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 167   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 168   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 169   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 170   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 171 
 172   // During deoptimization only the result registers need to be restored,
 173   // all the other values have already been extracted.
 174   static void restore_result_registers(MacroAssembler* masm);
 175 };
 176 
 177 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 178   int off = 0;
 179   int num_xmm_regs = XMMRegister::available_xmm_registers();
 180 #if COMPILER2_OR_JVMCI
 181   if (save_wide_vectors && UseAVX == 0) {
  182     save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 183   }
 184   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 185 #else
 186   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 187 #endif
 188 
  189   // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 190   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 191   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 192   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 193   // CodeBlob frame size is in words.
 194   int frame_size_in_words = frame_size_in_bytes / wordSize;
 195   *total_frame_words = frame_size_in_words;
 196 
 197   // Save registers, fpu state, and flags.
 198   // We assume caller has already pushed the return address onto the
 199   // stack, so rsp is 8-byte aligned here.
  200   // We push rbp twice in this sequence because we want the real rbp
  201   // to be under the return address like a normal enter.
 202 
 203   __ enter();          // rsp becomes 16-byte aligned here
 204   __ push_CPU_state(); // Push a multiple of 16 bytes
 205 
 206   // push cpu state handles this on EVEX enabled targets
 207   if (save_wide_vectors) {
 208     // Save upper half of YMM registers(0..15)
 209     int base_addr = XSAVE_AREA_YMM_BEGIN;
 210     for (int n = 0; n < 16; n++) {
 211       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 212     }
 213     if (VM_Version::supports_evex()) {
 214       // Save upper half of ZMM registers(0..15)
 215       base_addr = XSAVE_AREA_ZMM_BEGIN;
 216       for (int n = 0; n < 16; n++) {
 217         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 218       }
 219       // Save full ZMM registers(16..num_xmm_regs)
 220       base_addr = XSAVE_AREA_UPPERBANK;
 221       off = 0;
 222       int vector_len = Assembler::AVX_512bit;
 223       for (int n = 16; n < num_xmm_regs; n++) {
 224         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 225       }
 226 #if COMPILER2_OR_JVMCI
 227       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 228       off = 0;
 229       for(int n = 0; n < KRegister::number_of_registers; n++) {
 230         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 231       }
 232 #endif
 233     }
 234   } else {
 235     if (VM_Version::supports_evex()) {
 236       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 237       int base_addr = XSAVE_AREA_UPPERBANK;
 238       off = 0;
 239       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 240       for (int n = 16; n < num_xmm_regs; n++) {
 241         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 242       }
 243 #if COMPILER2_OR_JVMCI
 244       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 245       off = 0;
 246       for(int n = 0; n < KRegister::number_of_registers; n++) {
 247         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 248       }
 249 #endif
 250     }
 251   }
 252   __ vzeroupper();
 253   if (frame::arg_reg_save_area_bytes != 0) {
 254     // Allocate argument register save area
 255     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 256   }
 257 
 258   // Set an oopmap for the call site.  This oopmap will map all
 259   // oop-registers and debug-info registers as callee-saved.  This
 260   // will allow deoptimization at this safepoint to find all possible
 261   // debug-info recordings, as well as let GC find all oops.
 262 
 263   OopMapSet *oop_maps = new OopMapSet();
 264   OopMap* map = new OopMap(frame_size_in_slots, 0);
 265 
 266 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 267 
 268   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 271   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 272   // rbp location is known implicitly by the frame sender code, needs no oopmap
  273   // and the location where rbp was saved is ignored
 274   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 283   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  284   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  285   // on EVEX enabled targets it is included in the xsave area
 286   off = xmm0_off;
 287   int delta = xmm1_off - off;
 288   for (int n = 0; n < 16; n++) {
 289     XMMRegister xmm_name = as_XMMRegister(n);
 290     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 291     off += delta;
 292   }
 293   if (UseAVX > 2) {
 294     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 295     off = zmm16_off;
 296     delta = zmm17_off - off;
 297     for (int n = 16; n < num_xmm_regs; n++) {
 298       XMMRegister zmm_name = as_XMMRegister(n);
 299       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 300       off += delta;
 301     }
 302   }
 303 
 304 #if COMPILER2_OR_JVMCI
 305   if (save_wide_vectors) {
 306     // Save upper half of YMM registers(0..15)
 307     off = ymm0_off;
 308     delta = ymm1_off - ymm0_off;
 309     for (int n = 0; n < 16; n++) {
 310       XMMRegister ymm_name = as_XMMRegister(n);
 311       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 312       off += delta;
 313     }
 314     if (VM_Version::supports_evex()) {
 315       // Save upper half of ZMM registers(0..15)
 316       off = zmm0_off;
 317       delta = zmm1_off - zmm0_off;
 318       for (int n = 0; n < 16; n++) {
 319         XMMRegister zmm_name = as_XMMRegister(n);
 320         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 321         off += delta;
 322       }
 323     }
 324   }
 325 #endif // COMPILER2_OR_JVMCI
 326 
 327   // %%% These should all be a waste but we'll keep things as they were for now
 328   if (true) {
 329     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 333     // rbp location is known implicitly by the frame sender code, needs no oopmap
 334     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 343     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
  344     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  345     // on EVEX enabled targets it is included in the xsave area
 346     off = xmm0H_off;
 347     delta = xmm1H_off - off;
 348     for (int n = 0; n < 16; n++) {
 349       XMMRegister xmm_name = as_XMMRegister(n);
 350       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 351       off += delta;
 352     }
 353     if (UseAVX > 2) {
 354       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 355       off = zmm16H_off;
 356       delta = zmm17H_off - off;
 357       for (int n = 16; n < num_xmm_regs; n++) {
 358         XMMRegister zmm_name = as_XMMRegister(n);
 359         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 360         off += delta;
 361       }
 362     }
 363   }
 364 
 365   return map;
 366 }
 367 
 368 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 369   int num_xmm_regs = XMMRegister::available_xmm_registers();
 370   if (frame::arg_reg_save_area_bytes != 0) {
 371     // Pop arg register save area
 372     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 373   }
 374 
 375 #if COMPILER2_OR_JVMCI
 376   if (restore_wide_vectors) {
 377     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 378     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 379   }
 380 #else
 381   assert(!restore_wide_vectors, "vectors are generated only by C2");
 382 #endif
 383 
 384   __ vzeroupper();
 385 
 386   // On EVEX enabled targets everything is handled in pop fpu state
 387   if (restore_wide_vectors) {
 388     // Restore upper half of YMM registers (0..15)
 389     int base_addr = XSAVE_AREA_YMM_BEGIN;
 390     for (int n = 0; n < 16; n++) {
 391       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 392     }
 393     if (VM_Version::supports_evex()) {
 394       // Restore upper half of ZMM registers (0..15)
 395       base_addr = XSAVE_AREA_ZMM_BEGIN;
 396       for (int n = 0; n < 16; n++) {
 397         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 398       }
 399       // Restore full ZMM registers(16..num_xmm_regs)
 400       base_addr = XSAVE_AREA_UPPERBANK;
 401       int vector_len = Assembler::AVX_512bit;
 402       int off = 0;
 403       for (int n = 16; n < num_xmm_regs; n++) {
 404         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 405       }
 406 #if COMPILER2_OR_JVMCI
 407       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 408       off = 0;
 409       for (int n = 0; n < KRegister::number_of_registers; n++) {
 410         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 411       }
 412 #endif
 413     }
 414   } else {
 415     if (VM_Version::supports_evex()) {
 416       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 417       int base_addr = XSAVE_AREA_UPPERBANK;
 418       int off = 0;
 419       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 420       for (int n = 16; n < num_xmm_regs; n++) {
 421         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 422       }
 423 #if COMPILER2_OR_JVMCI
 424       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 425       off = 0;
 426       for (int n = 0; n < KRegister::number_of_registers; n++) {
 427         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 428       }
 429 #endif
 430     }
 431   }
 432 
 433   // Recover CPU state
 434   __ pop_CPU_state();
 435   // Get the rbp described implicitly by the calling convention (no oopMap)
 436   __ pop(rbp);
 437 }
 438 
 439 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 440 
 441   // Just restore result register. Only used by deoptimization. By
 442   // now any callee save register that needs to be restored to a c2
 443   // caller of the deoptee has been extracted into the vframeArray
 444   // and will be stuffed into the c2i adapter we create for later
 445   // restoration so only result registers need to be restored here.
 446 
 447   // Restore fp result register
 448   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 449   // Restore integer result register
 450   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 451   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 452 
  453   // Pop all of the register save area off the stack except the return address
 454   __ addptr(rsp, return_offset_in_bytes());
 455 }
 456 
  457 // Is the vector's size (in bytes) bigger than the size saved by default?
  458 // The 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
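// For example, a 32-byte YMM or 64-byte ZMM vector is "wide" in this sense,
// while a 16-byte XMM vector is not.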
 459 bool SharedRuntime::is_wide_vector(int size) {
 460   return size > 16;
 461 }
 462 
 463 // ---------------------------------------------------------------------------
 464 // Read the array of BasicTypes from a signature, and compute where the
 465 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 466 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 467 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 468 // as framesizes are fixed.
  469 // VMRegImpl::stack0 refers to the first slot at 0(sp), and
  470 // VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
  471 // Registers up to Register::number_of_registers are the 64-bit
 472 // integer registers.
 473 
 474 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 475 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
  476 // units regardless of build. Of course for i486 there is no 64-bit build.
 477 
 478 // The Java calling convention is a "shifted" version of the C ABI.
 479 // By skipping the first C ABI register we can call non-static jni methods
 480 // with small numbers of arguments without having to shuffle the arguments
 481 // at all. Since we control the java ABI we ought to at least get some
 482 // advantage out of it.
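// As a concrete sketch (assuming the SysV AMD64 register assignments used by the
// non-Windows port): the C integer argument registers are rdi, rsi, rdx, rcx, r8, r9,
// while j_rarg0..j_rarg5 map to rsi, rdx, rcx, r8, r9, rdi. A receiver arriving in
// j_rarg0 (rsi) is therefore already in c_rarg1, so a non-static native wrapper only
// has to materialize the JNIEnv* in c_rarg0 (rdi).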
 483 
 484 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 485                                            VMRegPair *regs,
 486                                            int total_args_passed) {
 487 
 488   // Create the mapping between argument positions and
 489   // registers.
 490   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 491     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 492   };
 493   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 494     j_farg0, j_farg1, j_farg2, j_farg3,
 495     j_farg4, j_farg5, j_farg6, j_farg7
 496   };
 497 
 498 
 499   uint int_args = 0;
 500   uint fp_args = 0;
 501   uint stk_args = 0; // inc by 2 each time
 502 
 503   for (int i = 0; i < total_args_passed; i++) {
 504     switch (sig_bt[i]) {
 505     case T_BOOLEAN:
 506     case T_CHAR:
 507     case T_BYTE:
 508     case T_SHORT:
 509     case T_INT:
 510       if (int_args < Argument::n_int_register_parameters_j) {
 511         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 512       } else {
 513         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 514         stk_args += 2;
 515       }
 516       break;
 517     case T_VOID:
 518       // halves of T_LONG or T_DOUBLE
 519       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 520       regs[i].set_bad();
 521       break;
 522     case T_LONG:
 523       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 524       // fall through
 525     case T_OBJECT:
 526     case T_ARRAY:
 527     case T_ADDRESS:
 528       if (int_args < Argument::n_int_register_parameters_j) {
 529         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 530       } else {
 531         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 532         stk_args += 2;
 533       }
 534       break;
 535     case T_FLOAT:
 536       if (fp_args < Argument::n_float_register_parameters_j) {
 537         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 538       } else {
 539         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 540         stk_args += 2;
 541       }
 542       break;
 543     case T_DOUBLE:
 544       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 545       if (fp_args < Argument::n_float_register_parameters_j) {
 546         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 547       } else {
 548         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 549         stk_args += 2;
 550       }
 551       break;
 552     default:
 553       ShouldNotReachHere();
 554       break;
 555     }
 556   }
 557 
 558   return align_up(stk_args, 2);
 559 }
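
// For illustration only (a sketch of what the routine above computes, not part of
// the original code): for a static method with signature (long, int, double, Object),
// sig_bt is {T_LONG, T_VOID, T_INT, T_DOUBLE, T_VOID, T_OBJECT} and the mapping is
//   regs[0] = j_rarg0 (pair)   regs[2] = j_rarg1
//   regs[3] = j_farg0 (pair)   regs[5] = j_rarg2 (pair)
// with the T_VOID halves marked bad and a returned stack-slot count of 0, since
// every argument fits in registers.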
 560 
  561 // Patch the caller's callsite with entry to compiled code if it exists.
 562 static void patch_callers_callsite(MacroAssembler *masm) {
 563   Label L;
 564   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 565   __ jcc(Assembler::equal, L);
 566 
 567   // Save the current stack pointer
 568   __ mov(r13, rsp);
 569   // Schedule the branch target address early.
 570   // Call into the VM to patch the caller, then jump to compiled callee
  571   // rax isn't live, so capture the return address while we easily can
 572   __ movptr(rax, Address(rsp, 0));
 573 
 574   // align stack so push_CPU_state doesn't fault
 575   __ andptr(rsp, -(StackAlignmentInBytes));
 576   __ push_CPU_state();
 577   __ vzeroupper();
 578   // VM needs caller's callsite
 579   // VM needs target method
 580   // This needs to be a long call since we will relocate this adapter to
 581   // the codeBuffer and it may not reach
 582 
 583   // Allocate argument register save area
 584   if (frame::arg_reg_save_area_bytes != 0) {
 585     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 586   }
 587   __ mov(c_rarg0, rbx);
 588   __ mov(c_rarg1, rax);
 589   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 590 
 591   // De-allocate argument register save area
 592   if (frame::arg_reg_save_area_bytes != 0) {
 593     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 594   }
 595 
 596   __ vzeroupper();
 597   __ pop_CPU_state();
 598   // restore sp
 599   __ mov(rsp, r13);
 600   __ bind(L);
 601 }
 602 
 603 
 604 static void gen_c2i_adapter(MacroAssembler *masm,
 605                             int total_args_passed,
 606                             int comp_args_on_stack,
 607                             const BasicType *sig_bt,
 608                             const VMRegPair *regs,
 609                             Label& skip_fixup) {
 610   // Before we get into the guts of the C2I adapter, see if we should be here
 611   // at all.  We've come from compiled code and are attempting to jump to the
 612   // interpreter, which means the caller made a static call to get here
 613   // (vcalls always get a compiled target if there is one).  Check for a
 614   // compiled target.  If there is one, we need to patch the caller's call.
 615   patch_callers_callsite(masm);
 616 
 617   __ bind(skip_fixup);
 618 
 619   // Since all args are passed on the stack, total_args_passed *
 620   // Interpreter::stackElementSize is the space we need.
 621 
 622   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 623 
 624   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 625 
 626   // stack is aligned, keep it that way
 627   // This is not currently needed or enforced by the interpreter, but
 628   // we might as well conform to the ABI.
 629   extraspace = align_up(extraspace, 2*wordSize);
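  // For example (assuming Interpreter::stackElementSize is one word, i.e. 8 bytes,
  // on this 64-bit port): 5 interpreter slots need 40 bytes, which is rounded up
  // here to 48 bytes to keep the 16-byte alignment.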
 630 
 631   // set senderSP value
 632   __ lea(r13, Address(rsp, wordSize));
 633 
 634 #ifdef ASSERT
 635   __ check_stack_alignment(r13, "sender stack not aligned");
 636 #endif
 637   if (extraspace > 0) {
 638     // Pop the return address
 639     __ pop(rax);
 640 
 641     __ subptr(rsp, extraspace);
 642 
 643     // Push the return address
 644     __ push(rax);
 645 
 646     // Account for the return address location since we store it first rather
 647     // than hold it in a register across all the shuffling
 648     extraspace += wordSize;
 649   }
 650 
 651 #ifdef ASSERT
 652   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 653 #endif
 654 
 655   // Now write the args into the outgoing interpreter space
 656   for (int i = 0; i < total_args_passed; i++) {
 657     if (sig_bt[i] == T_VOID) {
 658       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 659       continue;
 660     }
 661 
 662     // offset to start parameters
 663     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 664     int next_off = st_off - Interpreter::stackElementSize;
 665 
 666     // Say 4 args:
 667     // i   st_off
 668     // 0   32 T_LONG
 669     // 1   24 T_VOID
 670     // 2   16 T_OBJECT
 671     // 3    8 T_BOOL
 672     // -    0 return address
 673     //
  674     // However, to make things extra confusing: because we can fit a long/double in
  675     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
  676     // leaves one slot empty and only stores to a single slot. In this case the
  677     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 678 
 679     VMReg r_1 = regs[i].first();
 680     VMReg r_2 = regs[i].second();
 681     if (!r_1->is_valid()) {
 682       assert(!r_2->is_valid(), "");
 683       continue;
 684     }
 685     if (r_1->is_stack()) {
  686       // memory to memory, use rax
 687       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 688       if (!r_2->is_valid()) {
 689         // sign extend??
 690         __ movl(rax, Address(rsp, ld_off));
 691         __ movptr(Address(rsp, st_off), rax);
 692 
 693       } else {
 694 
 695         __ movq(rax, Address(rsp, ld_off));
 696 
  697         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 698         // T_DOUBLE and T_LONG use two slots in the interpreter
 699         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 700           // ld_off == LSW, ld_off+wordSize == MSW
 701           // st_off == MSW, next_off == LSW
 702           __ movq(Address(rsp, next_off), rax);
 703 #ifdef ASSERT
 704           // Overwrite the unused slot with known junk
 705           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 706           __ movptr(Address(rsp, st_off), rax);
 707 #endif /* ASSERT */
 708         } else {
 709           __ movq(Address(rsp, st_off), rax);
 710         }
 711       }
 712     } else if (r_1->is_Register()) {
 713       Register r = r_1->as_Register();
 714       if (!r_2->is_valid()) {
  715         // must be only an int (or less) so move only 32 bits to the slot
 716         // why not sign extend??
 717         __ movl(Address(rsp, st_off), r);
 718       } else {
  719         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 720         // T_DOUBLE and T_LONG use two slots in the interpreter
 721         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 722           // long/double in gpr
 723 #ifdef ASSERT
 724           // Overwrite the unused slot with known junk
 725           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 726           __ movptr(Address(rsp, st_off), rax);
 727 #endif /* ASSERT */
 728           __ movq(Address(rsp, next_off), r);
 729         } else {
 730           __ movptr(Address(rsp, st_off), r);
 731         }
 732       }
 733     } else {
 734       assert(r_1->is_XMMRegister(), "");
 735       if (!r_2->is_valid()) {
  736         // only a float, use just part of the slot
 737         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 738       } else {
 739 #ifdef ASSERT
 740         // Overwrite the unused slot with known junk
 741         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 742         __ movptr(Address(rsp, st_off), rax);
 743 #endif /* ASSERT */
 744         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 745       }
 746     }
 747   }
 748 
 749   // Schedule the branch target address early.
 750   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 751   __ jmp(rcx);
 752 }
 753 
 754 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 755                         address code_start, address code_end,
 756                         Label& L_ok) {
 757   Label L_fail;
 758   __ lea(temp_reg, ExternalAddress(code_start));
 759   __ cmpptr(pc_reg, temp_reg);
 760   __ jcc(Assembler::belowEqual, L_fail);
 761   __ lea(temp_reg, ExternalAddress(code_end));
 762   __ cmpptr(pc_reg, temp_reg);
 763   __ jcc(Assembler::below, L_ok);
 764   __ bind(L_fail);
 765 }
 766 
 767 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 768                                     int total_args_passed,
 769                                     int comp_args_on_stack,
 770                                     const BasicType *sig_bt,
 771                                     const VMRegPair *regs) {
 772 
  773   // Note: r13 contains the senderSP on entry. We must preserve it since
  774   // we may do an i2c -> c2i transition if we lose a race where compiled
  775   // code goes non-entrant while we get args ready.
  776   // In addition we use r13 to locate all the interpreter args as
  777   // we must align the stack to 16 bytes on an i2c entry, or else we
  778   // lose the alignment we expect in all compiled code and the register
  779   // save code can segv when fxsave instructions find an improperly
  780   // aligned stack pointer.
 781 
 782   // Adapters can be frameless because they do not require the caller
 783   // to perform additional cleanup work, such as correcting the stack pointer.
 784   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 785   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 786   // even if a callee has modified the stack pointer.
 787   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 788   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 789   // up via the senderSP register).
 790   // In other words, if *either* the caller or callee is interpreted, we can
 791   // get the stack pointer repaired after a call.
 792   // This is why c2i and i2c adapters cannot be indefinitely composed.
 793   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 794   // both caller and callee would be compiled methods, and neither would
 795   // clean up the stack pointer changes performed by the two adapters.
 796   // If this happens, control eventually transfers back to the compiled
 797   // caller, but with an uncorrected stack, causing delayed havoc.
 798 
 799   if (VerifyAdapterCalls &&
 800       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 801     // So, let's test for cascading c2i/i2c adapters right now.
 802     //  assert(Interpreter::contains($return_addr) ||
 803     //         StubRoutines::contains($return_addr),
 804     //         "i2c adapter must return to an interpreter frame");
 805     __ block_comment("verify_i2c { ");
 806     // Pick up the return address
 807     __ movptr(rax, Address(rsp, 0));
 808     Label L_ok;
 809     if (Interpreter::code() != nullptr) {
 810       range_check(masm, rax, r11,
 811                   Interpreter::code()->code_start(),
 812                   Interpreter::code()->code_end(),
 813                   L_ok);
 814     }
 815     if (StubRoutines::initial_stubs_code() != nullptr) {
 816       range_check(masm, rax, r11,
 817                   StubRoutines::initial_stubs_code()->code_begin(),
 818                   StubRoutines::initial_stubs_code()->code_end(),
 819                   L_ok);
 820     }
 821     if (StubRoutines::final_stubs_code() != nullptr) {
 822       range_check(masm, rax, r11,
 823                   StubRoutines::final_stubs_code()->code_begin(),
 824                   StubRoutines::final_stubs_code()->code_end(),
 825                   L_ok);
 826     }
 827     const char* msg = "i2c adapter must return to an interpreter frame";
 828     __ block_comment(msg);
 829     __ stop(msg);
 830     __ bind(L_ok);
 831     __ block_comment("} verify_i2ce ");
 832   }
 833 
 834   // Must preserve original SP for loading incoming arguments because
 835   // we need to align the outgoing SP for compiled code.
 836   __ movptr(r11, rsp);
 837 
 838   // Pick up the return address
 839   __ pop(rax);
 840 
 841   // Convert 4-byte c2 stack slots to words.
 842   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 843 
 844   if (comp_args_on_stack) {
 845     __ subptr(rsp, comp_words_on_stack * wordSize);
 846   }
 847 
 848   // Ensure compiled code always sees stack at proper alignment
 849   __ andptr(rsp, -16);
 850 
  851   // Push the return address; this misaligns the stack so that the youngest frame
  852   // sees it exactly as it would right after a call instruction.
 853   __ push(rax);
 854 
 855   // Put saved SP in another register
 856   const Register saved_sp = rax;
 857   __ movptr(saved_sp, r11);
 858 
 859   // Will jump to the compiled code just as if compiled code was doing it.
 860   // Pre-load the register-jump target early, to schedule it better.
 861   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 862 
 863 #if INCLUDE_JVMCI
 864   if (EnableJVMCI) {
 865     // check if this call should be routed towards a specific entry point
 866     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 867     Label no_alternative_target;
 868     __ jcc(Assembler::equal, no_alternative_target);
 869     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 870     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 871     __ bind(no_alternative_target);
 872   }
 873 #endif // INCLUDE_JVMCI
 874 
 875   // Now generate the shuffle code.  Pick up all register args and move the
 876   // rest through the floating point stack top.
 877   for (int i = 0; i < total_args_passed; i++) {
 878     if (sig_bt[i] == T_VOID) {
 879       // Longs and doubles are passed in native word order, but misaligned
 880       // in the 32-bit build.
 881       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 882       continue;
 883     }
 884 
 885     // Pick up 0, 1 or 2 words from SP+offset.
 886 
 887     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 888             "scrambled load targets?");
 889     // Load in argument order going down.
 890     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 891     // Point to interpreter value (vs. tag)
 892     int next_off = ld_off - Interpreter::stackElementSize;
 893     //
 894     //
 895     //
 896     VMReg r_1 = regs[i].first();
 897     VMReg r_2 = regs[i].second();
 898     if (!r_1->is_valid()) {
 899       assert(!r_2->is_valid(), "");
 900       continue;
 901     }
 902     if (r_1->is_stack()) {
 903       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 904       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 905 
  906       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
  907       // and if we end up going through a c2i because of a miss, a reasonable value of r13
  908       // will be generated.
 909       if (!r_2->is_valid()) {
 910         // sign extend???
 911         __ movl(r13, Address(saved_sp, ld_off));
 912         __ movptr(Address(rsp, st_off), r13);
 913       } else {
 914         //
  915         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
  916         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
  917         // so we must adjust where to pick up the data to match the interpreter.
  918         //
  919         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
  920         // are accessed at negative offsets, so the LSW is at the LOW address.
 921 
 922         // ld_off is MSW so get LSW
 923         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 924                            next_off : ld_off;
 925         __ movq(r13, Address(saved_sp, offset));
 926         // st_off is LSW (i.e. reg.first())
 927         __ movq(Address(rsp, st_off), r13);
 928       }
 929     } else if (r_1->is_Register()) {  // Register argument
 930       Register r = r_1->as_Register();
 931       assert(r != rax, "must be different");
 932       if (r_2->is_valid()) {
 933         //
  934         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
  935         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
  936         // so we must adjust where to pick up the data to match the interpreter.
 937 
 938         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 939                            next_off : ld_off;
 940 
 941         // this can be a misaligned move
 942         __ movq(r, Address(saved_sp, offset));
 943       } else {
 944         // sign extend and use a full word?
 945         __ movl(r, Address(saved_sp, ld_off));
 946       }
 947     } else {
 948       if (!r_2->is_valid()) {
 949         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 950       } else {
 951         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 952       }
 953     }
 954   }
 955 
 956   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 957 
  958   // 6243940 We might end up in handle_wrong_method if
  959   // the callee is deoptimized as we race through here. If that
  960   // happens we don't want to take a safepoint because the
  961   // caller frame will look interpreted and arguments are now
  962   // "compiled" so it is much better to make this transition
  963   // invisible to the stack walking code. Unfortunately if
  964   // we try to find the callee by normal means a safepoint
  965   // is possible. So we stash the desired callee in the thread
  966   // and the VM will find it there should this case occur.
 967 
 968   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 969 
  970   // Put the Method* where a c2i would expect it should we end up there.
  971   // This is only needed because c2 resolve stubs return the Method* as a
  972   // result in rax.
 973   __ mov(rax, rbx);
 974   __ jmp(r11);
 975 }
 976 
 977 // ---------------------------------------------------------------
 978 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 979                                                             int total_args_passed,
 980                                                             int comp_args_on_stack,
 981                                                             const BasicType *sig_bt,
 982                                                             const VMRegPair *regs,
 983                                                             AdapterFingerPrint* fingerprint) {
 984   address i2c_entry = __ pc();
 985 
 986   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 987 
 988   // -------------------------------------------------------------------------
 989   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 990   // to the interpreter.  The args start out packed in the compiled layout.  They
 991   // need to be unpacked into the interpreter layout.  This will almost always
 992   // require some stack space.  We grow the current (compiled) stack, then repack
 993   // the args.  We  finally end in a jump to the generic interpreter entry point.
 994   // On exit from the interpreter, the interpreter will restore our SP (lest the
 995   // compiled code, which relies solely on SP and not RBP, get sick).
 996 
 997   address c2i_unverified_entry = __ pc();
 998   Label skip_fixup;
 999   Label ok;
1000 
1001   Register holder = rax;
1002   Register receiver = j_rarg0;
1003   Register temp = rbx;
1004 
1005   {
1006     __ load_klass(temp, receiver, rscratch1);
1007     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
1008     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
1009     __ jcc(Assembler::equal, ok);
1010     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1011 
1012     __ bind(ok);
 1013     // Method might have been compiled since the call site was patched to
 1014     // interpreted; if that is the case, treat it as a miss so we can get
 1015     // the call site corrected.
1016     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1017     __ jcc(Assembler::equal, skip_fixup);
1018     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1019   }
1020 
1021   address c2i_entry = __ pc();
1022 
1023   // Class initialization barrier for static methods
1024   address c2i_no_clinit_check_entry = nullptr;
1025   if (VM_Version::supports_fast_class_init_checks()) {
1026     Label L_skip_barrier;
1027     Register method = rbx;
1028 
1029     { // Bypass the barrier for non-static methods
1030       Register flags = rscratch1;
1031       __ movl(flags, Address(method, Method::access_flags_offset()));
1032       __ testl(flags, JVM_ACC_STATIC);
1033       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1034     }
1035 
1036     Register klass = rscratch1;
1037     __ load_method_holder(klass, method);
1038     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1039 
1040     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1041 
1042     __ bind(L_skip_barrier);
1043     c2i_no_clinit_check_entry = __ pc();
1044   }
1045 
1046   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1047   bs->c2i_entry_barrier(masm);
1048 
1049   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1050 
1051   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1052 }
1053 
1054 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1055                                          VMRegPair *regs,
1056                                          int total_args_passed) {
1057 
 1058 // We return the number of VMRegImpl stack slots we need to reserve for all
 1059 // the arguments NOT counting out_preserve_stack_slots.
1060 
1061 // NOTE: These arrays will have to change when c1 is ported
1062 #ifdef _WIN64
1063     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1064       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1065     };
1066     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1067       c_farg0, c_farg1, c_farg2, c_farg3
1068     };
1069 #else
1070     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1071       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1072     };
1073     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1074       c_farg0, c_farg1, c_farg2, c_farg3,
1075       c_farg4, c_farg5, c_farg6, c_farg7
1076     };
1077 #endif // _WIN64
1078 
1079 
1080     uint int_args = 0;
1081     uint fp_args = 0;
1082     uint stk_args = 0; // inc by 2 each time
1083 
1084     for (int i = 0; i < total_args_passed; i++) {
1085       switch (sig_bt[i]) {
1086       case T_BOOLEAN:
1087       case T_CHAR:
1088       case T_BYTE:
1089       case T_SHORT:
1090       case T_INT:
1091         if (int_args < Argument::n_int_register_parameters_c) {
1092           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1093 #ifdef _WIN64
1094           fp_args++;
 1095           // Allocate slots for the callee to stuff register args on the stack.
1096           stk_args += 2;
1097 #endif
1098         } else {
1099           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1100           stk_args += 2;
1101         }
1102         break;
1103       case T_LONG:
1104         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1105         // fall through
1106       case T_OBJECT:
1107       case T_ARRAY:
1108       case T_ADDRESS:
1109       case T_METADATA:
1110         if (int_args < Argument::n_int_register_parameters_c) {
1111           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1112 #ifdef _WIN64
1113           fp_args++;
1114           stk_args += 2;
1115 #endif
1116         } else {
1117           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1118           stk_args += 2;
1119         }
1120         break;
1121       case T_FLOAT:
1122         if (fp_args < Argument::n_float_register_parameters_c) {
1123           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1124 #ifdef _WIN64
1125           int_args++;
 1126           // Allocate slots for the callee to stuff register args on the stack.
1127           stk_args += 2;
1128 #endif
1129         } else {
1130           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1131           stk_args += 2;
1132         }
1133         break;
1134       case T_DOUBLE:
1135         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1136         if (fp_args < Argument::n_float_register_parameters_c) {
1137           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1138 #ifdef _WIN64
1139           int_args++;
 1140           // Allocate slots for the callee to stuff register args on the stack.
1141           stk_args += 2;
1142 #endif
1143         } else {
1144           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1145           stk_args += 2;
1146         }
1147         break;
1148       case T_VOID: // Halves of longs and doubles
1149         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1150         regs[i].set_bad();
1151         break;
1152       default:
1153         ShouldNotReachHere();
1154         break;
1155       }
1156     }
1157 #ifdef _WIN64
 1158   // The Windows ABI requires that we always allocate enough stack space
 1159   // for 4 64-bit registers to be stored down.
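  // For illustration: even a call that passes no arguments at all therefore reserves
  // 8 slots * 4 bytes == 32 bytes of shadow space for the callee.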
1160   if (stk_args < 8) {
1161     stk_args = 8;
1162   }
1163 #endif // _WIN64
1164 
1165   return stk_args;
1166 }
1167 
1168 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1169                                              uint num_bits,
1170                                              uint total_args_passed) {
1171   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1172          "only certain vector sizes are supported for now");
1173 
1174   static const XMMRegister VEC_ArgReg[32] = {
1175      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1176      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1177     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1178     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1179   };
1180 
1181   uint stk_args = 0;
1182   uint fp_args = 0;
1183 
1184   for (uint i = 0; i < total_args_passed; i++) {
1185     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1186     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1187     regs[i].set_pair(vmreg->next(next_val), vmreg);
1188   }
1189 
1190   return stk_args;
1191 }
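
// For illustration only (not part of the original code): with num_bits == 256 the
// pair for argument i spans vmreg .. vmreg->next(7), i.e. eight 4-byte VMReg slots
// covering the full 32-byte YMM register; no stack slots are used, so 0 is returned.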
1192 
1193 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
 1194   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
 1195   // which by this time is free to use.
1196   switch (ret_type) {
1197   case T_FLOAT:
1198     __ movflt(Address(rbp, -wordSize), xmm0);
1199     break;
1200   case T_DOUBLE:
1201     __ movdbl(Address(rbp, -wordSize), xmm0);
1202     break;
1203   case T_VOID:  break;
1204   default: {
1205     __ movptr(Address(rbp, -wordSize), rax);
1206     }
1207   }
1208 }
1209 
1210 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
 1211   // We always ignore the frame_slots arg and just use the space just below the frame pointer,
 1212   // which by this time is free to use.
1213   switch (ret_type) {
1214   case T_FLOAT:
1215     __ movflt(xmm0, Address(rbp, -wordSize));
1216     break;
1217   case T_DOUBLE:
1218     __ movdbl(xmm0, Address(rbp, -wordSize));
1219     break;
1220   case T_VOID:  break;
1221   default: {
1222     __ movptr(rax, Address(rbp, -wordSize));
1223     }
1224   }
1225 }
1226 
1227 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1228     for ( int i = first_arg ; i < arg_count ; i++ ) {
1229       if (args[i].first()->is_Register()) {
1230         __ push(args[i].first()->as_Register());
1231       } else if (args[i].first()->is_XMMRegister()) {
1232         __ subptr(rsp, 2*wordSize);
1233         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1234       }
1235     }
1236 }
1237 
1238 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1239     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1240       if (args[i].first()->is_Register()) {
1241         __ pop(args[i].first()->as_Register());
1242       } else if (args[i].first()->is_XMMRegister()) {
1243         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1244         __ addptr(rsp, 2*wordSize);
1245       }
1246     }
1247 }
1248 
1249 static void verify_oop_args(MacroAssembler* masm,
1250                             const methodHandle& method,
1251                             const BasicType* sig_bt,
1252                             const VMRegPair* regs) {
1253   Register temp_reg = rbx;  // not part of any compiled calling seq
1254   if (VerifyOops) {
1255     for (int i = 0; i < method->size_of_parameters(); i++) {
1256       if (is_reference_type(sig_bt[i])) {
1257         VMReg r = regs[i].first();
1258         assert(r->is_valid(), "bad oop arg");
1259         if (r->is_stack()) {
1260           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1261           __ verify_oop(temp_reg);
1262         } else {
1263           __ verify_oop(r->as_Register());
1264         }
1265       }
1266     }
1267   }
1268 }
1269 
1270 static void check_continuation_enter_argument(VMReg actual_vmreg,
1271                                               Register expected_reg,
1272                                               const char* name) {
1273   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1274   assert(actual_vmreg->as_Register() == expected_reg,
1275          "%s is in unexpected register: %s instead of %s",
1276          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1277 }
1278 
1279 
1280 //---------------------------- continuation_enter_setup ---------------------------
1281 //
1282 // Arguments:
1283 //   None.
1284 //
1285 // Results:
1286 //   rsp: pointer to blank ContinuationEntry
1287 //
1288 // Kills:
1289 //   rax
1290 //
1291 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1292   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1293   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1294   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1295 
1296   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1297   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1298 
1299   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1300   OopMap* map = new OopMap(frame_size, 0);
1301 
1302   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1303   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1304   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1305 
1306   return map;
1307 }
1308 
1309 //---------------------------- fill_continuation_entry ---------------------------
1310 //
1311 // Arguments:
1312 //   rsp: pointer to blank Continuation entry
1313 //   reg_cont_obj: pointer to the continuation
1314 //   reg_flags: flags
1315 //
1316 // Results:
1317 //   rsp: pointer to filled out ContinuationEntry
1318 //
1319 // Kills:
1320 //   rax
1321 //
1322 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1323   assert_different_registers(rax, reg_cont_obj, reg_flags);
1324 #ifdef ASSERT
1325   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1326 #endif
1327   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1328   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1329   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1330   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1331   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1332 
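  // Save the parent's cont_fastpath and held monitor count into the entry, then
  // clear both thread fields for the new continuation.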
1333   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1334   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1335   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1336   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1337 
1338   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1339   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1340 }
1341 
1342 //---------------------------- continuation_enter_cleanup ---------------------------
1343 //
1344 // Arguments:
1345 //   rsp: pointer to the ContinuationEntry
1346 //
1347 // Results:
1348 //   rsp: pointer to the spilled rbp in the entry frame
1349 //
1350 // Kills:
1351 //   rbx
1352 //
1353 static void continuation_enter_cleanup(MacroAssembler* masm) {
1354 #ifdef ASSERT
1355   Label L_good_sp;
1356   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1357   __ jcc(Assembler::equal, L_good_sp);
1358   __ stop("Incorrect rsp at continuation_enter_cleanup");
1359   __ bind(L_good_sp);
1360 #endif
1361 
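  // Restore the parent's cont_fastpath and held monitor count into the thread,
  // then unlink the entry and free its stack space.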
1362   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1363   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1364   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1365   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1366 
1367   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1368   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1369   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1370 }
1371 
1372 static void gen_continuation_enter(MacroAssembler* masm,
1373                                    const VMRegPair* regs,
1374                                    int& exception_offset,
1375                                    OopMapSet* oop_maps,
1376                                    int& frame_complete,
1377                                    int& stack_slots,
1378                                    int& interpreted_entry_offset,
1379                                    int& compiled_entry_offset) {
1380 
1381   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1382   int pos_cont_obj   = 0;
1383   int pos_is_cont    = 1;
1384   int pos_is_virtual = 2;
1385 
1386   // The platform-specific calling convention may present the arguments in various registers.
1387   // To simplify the rest of the code, we expect the arguments to reside in these known
1388   // registers, and we additionally check the placement here in case the calling convention
1389   // ever changes.
1390   Register reg_cont_obj   = c_rarg1;
1391   Register reg_is_cont    = c_rarg2;
1392   Register reg_is_virtual = c_rarg3;
1393 
1394   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1395   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1396   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1397 
1398   // Utility methods kill rax; make sure there are no collisions
1399   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1400 
1401   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1402                          relocInfo::static_call_type);
1403 
1404   address start = __ pc();
1405 
1406   Label L_thaw, L_exit;
1407 
1408   // i2i entry, used only in interp_only_mode
1409   interpreted_entry_offset = __ pc() - start;
1410   {
1411 #ifdef ASSERT
1412     Label is_interp_only;
1413     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1414     __ jcc(Assembler::notEqual, is_interp_only);
1415     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1416     __ bind(is_interp_only);
1417 #endif
1418 
1419     __ pop(rax); // return address
1420     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1421     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1422     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1423     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1424     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1425     __ push(rax); // return address
1426     __ push_cont_fastpath();
1427 
1428     __ enter();
1429 
1430     stack_slots = 2; // will be adjusted in setup
1431     OopMap* map = continuation_enter_setup(masm, stack_slots);
1432     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1433     // That is okay: at worst we miss an async sample, and we are in interp_only_mode anyway.
1434 
1435     __ verify_oop(reg_cont_obj);
1436 
1437     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1438 
1439     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1440     __ testptr(reg_is_cont, reg_is_cont);
1441     __ jcc(Assembler::notZero, L_thaw);
1442 
1443     // --- Resolve path
1444 
1445     // Make sure the call is patchable
1446     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1447     // Emit stub for static call
1448     CodeBuffer* cbuf = masm->code_section()->outer();
1449     address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1450     if (stub == nullptr) {
1451       fatal("CodeCache is full at gen_continuation_enter");
1452     }
1453     __ call(resolve);
1454     oop_maps->add_gc_map(__ pc() - start, map);
1455     __ post_call_nop();
1456 
1457     __ jmp(L_exit);
1458   }
1459 
1460   // compiled entry
1461   __ align(CodeEntryAlignment);
1462   compiled_entry_offset = __ pc() - start;
1463   __ enter();
1464 
1465   stack_slots = 2; // will be adjusted in setup
1466   OopMap* map = continuation_enter_setup(masm, stack_slots);
1467 
1468   // Frame is now completed as far as size and linkage.
1469   frame_complete = __ pc() - start;
1470 
1471   __ verify_oop(reg_cont_obj);
1472 
1473   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1474 
1475   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1476   __ testptr(reg_is_cont, reg_is_cont);
1477   __ jccb(Assembler::notZero, L_thaw);
1478 
1479   // --- call Continuation.enter(Continuation c, boolean isContinue)
1480 
1481   // Make sure the call is patchable
1482   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1483 
1484   // Emit stub for static call
1485   CodeBuffer* cbuf = masm->code_section()->outer();
1486   address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1487   if (stub == nullptr) {
1488     fatal("CodeCache is full at gen_continuation_enter");
1489   }
1490 
1491   // The call needs to be resolved. There's a special case for this in
1492   // SharedRuntime::find_callee_info_helper() which calls
1493   // LinkResolver::resolve_continuation_enter() which resolves the call to
1494   // Continuation.enter(Continuation c, boolean isContinue).
1495   __ call(resolve);
1496 
1497   oop_maps->add_gc_map(__ pc() - start, map);
1498   __ post_call_nop();
1499 
1500   __ jmpb(L_exit);
1501 
1502   // --- Thawing path
1503 
1504   __ bind(L_thaw);
1505 
1506   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1507 
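  // Record the pc right after the thaw call; the runtime uses it as the return
  // address into this enterSpecial frame.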
1508   ContinuationEntry::_return_pc_offset = __ pc() - start;
1509   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1510   __ post_call_nop();
1511 
1512   // --- Normal exit (resolve/thawing)
1513 
1514   __ bind(L_exit);
1515 
1516   continuation_enter_cleanup(masm);
1517   __ pop(rbp);
1518   __ ret(0);
1519 
1520   // --- Exception handling path
1521 
1522   exception_offset = __ pc() - start;
1523 
1524   continuation_enter_cleanup(masm);
1525   __ pop(rbp);
1526 
1527   __ movptr(c_rarg0, r15_thread);
1528   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1529 
1530   // rax still holds the original exception oop, save it before the call
1531   __ push(rax);
1532 
1533   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1534   __ movptr(rbx, rax);
1535 
1536   // Continue at exception handler:
1537   //   rax: exception oop
1538   //   rbx: exception handler
1539   //   rdx: exception pc
1540   __ pop(rax);
1541   __ verify_oop(rax);
1542   __ pop(rdx);
1543   __ jmp(rbx);
1544 }
1545 
1546 static void gen_continuation_yield(MacroAssembler* masm,
1547                                    const VMRegPair* regs,
1548                                    OopMapSet* oop_maps,
1549                                    int& frame_complete,
1550                                    int& stack_slots,
1551                                    int& compiled_entry_offset) {
1552   enum layout {
1553     rbp_off,
1554     rbpH_off,
1555     return_off,
1556     return_off2,
1557     framesize // inclusive of return address
1558   };
1559   stack_slots = framesize / VMRegImpl::slots_per_word;
1560   assert(stack_slots == 2, "recheck layout");
1561 
1562   address start = __ pc();
1563   compiled_entry_offset = __ pc() - start;
1564   __ enter();
1565   address the_pc = __ pc();
1566 
1567   frame_complete = the_pc - start;
1568 
1569   // This nop must be exactly at the PC we push into the frame info.
1570   // We use this nop for fast CodeBlob lookup, associate the OopMap
1571   // with it right away.
1572   __ post_call_nop();
1573   OopMap* map = new OopMap(framesize, 1);
1574   oop_maps->add_gc_map(frame_complete, map);
1575 
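  // Set up the last Java frame and attempt to freeze the continuation; a zero
  // result from freeze means success, anything else means the freeze failed
  // (e.g. the continuation is pinned) and we return to the caller.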
1576   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1577   __ movptr(c_rarg0, r15_thread);
1578   __ movptr(c_rarg1, rsp);
1579   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1580   __ reset_last_Java_frame(true);
1581 
1582   Label L_pinned;
1583 
1584   __ testptr(rax, rax);
1585   __ jcc(Assembler::notZero, L_pinned);
1586 
1587   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1588   continuation_enter_cleanup(masm);
1589   __ pop(rbp);
1590   __ ret(0);
1591 
1592   __ bind(L_pinned);
1593 
1594   // Pinned, return to caller
1595 
1596   // handle pending exception thrown by freeze
1597   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1598   Label ok;
1599   __ jcc(Assembler::equal, ok);
1600   __ leave();
1601   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1602   __ bind(ok);
1603 
1604   __ leave();
1605   __ ret(0);
1606 }
1607 
1608 static void gen_special_dispatch(MacroAssembler* masm,
1609                                  const methodHandle& method,
1610                                  const BasicType* sig_bt,
1611                                  const VMRegPair* regs) {
1612   verify_oop_args(masm, method, sig_bt, regs);
1613   vmIntrinsics::ID iid = method->intrinsic_id();
1614 
1615   // Now write the args into the outgoing interpreter space
1616   bool     has_receiver   = false;
1617   Register receiver_reg   = noreg;
1618   int      member_arg_pos = -1;
1619   Register member_reg     = noreg;
1620   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1621   if (ref_kind != 0) {
1622     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1623     member_reg = rbx;  // known to be free at this point
1624     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1625   } else if (iid == vmIntrinsics::_invokeBasic) {
1626     has_receiver = true;
1627   } else if (iid == vmIntrinsics::_linkToNative) {
1628     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1629     member_reg = rbx;  // known to be free at this point
1630   } else {
1631     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1632   }
1633 
1634   if (member_reg != noreg) {
1635     // Load the member_arg into register, if necessary.
1636     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1637     VMReg r = regs[member_arg_pos].first();
1638     if (r->is_stack()) {
1639       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1640     } else {
1641       // no data motion is needed
1642       member_reg = r->as_Register();
1643     }
1644   }
1645 
1646   if (has_receiver) {
1647     // Make sure the receiver is loaded into a register.
1648     assert(method->size_of_parameters() > 0, "oob");
1649     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1650     VMReg r = regs[0].first();
1651     assert(r->is_valid(), "bad receiver arg");
1652     if (r->is_stack()) {
1653       // Porting note:  This assumes that compiled calling conventions always
1654       // pass the receiver oop in a register.  If this is not true on some
1655       // platform, pick a temp and load the receiver from stack.
1656       fatal("receiver always in a register");
1657       receiver_reg = j_rarg0;  // known to be free at this point
1658       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1659     } else {
1660       // no data motion is needed
1661       receiver_reg = r->as_Register();
1662     }
1663   }
1664 
1665   // Figure out which address we are really jumping to:
1666   MethodHandles::generate_method_handle_dispatch(masm, iid,
1667                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1668 }
1669 
1670 // ---------------------------------------------------------------------------
1671 // Generate a native wrapper for a given method.  The method takes arguments
1672 // in the Java compiled code convention, marshals them to the native
1673 // convention (handlizes oops, etc), transitions to native, makes the call,
1674 // returns to java state (possibly blocking), unhandlizes any result and
1675 // returns.
1676 //
1677 // Critical native functions are a shorthand for the use of
1678 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1679 // functions.  The wrapper is expected to unpack the arguments before
1680 // passing them to the callee. Critical native functions leave the state _in_Java,
1681 // since they cannot stop for GC.
1682 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1683 // block and the check for pending exceptions, because it is impossible for them
1684 // to be thrown.
1685 //
1686 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1687                                                 const methodHandle& method,
1688                                                 int compile_id,
1689                                                 BasicType* in_sig_bt,
1690                                                 VMRegPair* in_regs,
1691                                                 BasicType ret_type) {
1692   if (method->is_continuation_native_intrinsic()) {
1693     int exception_offset = -1;
1694     OopMapSet* oop_maps = new OopMapSet();
1695     int frame_complete = -1;
1696     int stack_slots = -1;
1697     int interpreted_entry_offset = -1;
1698     int vep_offset = -1;
1699     if (method->is_continuation_enter_intrinsic()) {
1700       gen_continuation_enter(masm,
1701                              in_regs,
1702                              exception_offset,
1703                              oop_maps,
1704                              frame_complete,
1705                              stack_slots,
1706                              interpreted_entry_offset,
1707                              vep_offset);
1708     } else if (method->is_continuation_yield_intrinsic()) {
1709       gen_continuation_yield(masm,
1710                              in_regs,
1711                              oop_maps,
1712                              frame_complete,
1713                              stack_slots,
1714                              vep_offset);
1715     } else {
1716       guarantee(false, "Unknown Continuation native intrinsic");
1717     }
1718 
1719 #ifdef ASSERT
1720     if (method->is_continuation_enter_intrinsic()) {
1721       assert(interpreted_entry_offset != -1, "Must be set");
1722       assert(exception_offset != -1,         "Must be set");
1723     } else {
1724       assert(interpreted_entry_offset == -1, "Must be unset");
1725       assert(exception_offset == -1,         "Must be unset");
1726     }
1727     assert(frame_complete != -1,    "Must be set");
1728     assert(stack_slots != -1,       "Must be set");
1729     assert(vep_offset != -1,        "Must be set");
1730 #endif
1731 
1732     __ flush();
1733     nmethod* nm = nmethod::new_native_nmethod(method,
1734                                               compile_id,
1735                                               masm->code(),
1736                                               vep_offset,
1737                                               frame_complete,
1738                                               stack_slots,
1739                                               in_ByteSize(-1),
1740                                               in_ByteSize(-1),
1741                                               oop_maps,
1742                                               exception_offset);
1743     if (method->is_continuation_enter_intrinsic()) {
1744       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1745     } else if (method->is_continuation_yield_intrinsic()) {
1746       _cont_doYield_stub = nm;
1747     }
1748     return nm;
1749   }
1750 
1751   if (method->is_method_handle_intrinsic()) {
1752     vmIntrinsics::ID iid = method->intrinsic_id();
1753     intptr_t start = (intptr_t)__ pc();
1754     int vep_offset = ((intptr_t)__ pc()) - start;
1755     gen_special_dispatch(masm,
1756                          method,
1757                          in_sig_bt,
1758                          in_regs);
1759     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1760     __ flush();
1761     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1762     return nmethod::new_native_nmethod(method,
1763                                        compile_id,
1764                                        masm->code(),
1765                                        vep_offset,
1766                                        frame_complete,
1767                                        stack_slots / VMRegImpl::slots_per_word,
1768                                        in_ByteSize(-1),
1769                                        in_ByteSize(-1),
1770                                        nullptr);
1771   }
1772   address native_func = method->native_function();
1773   assert(native_func != nullptr, "must have function");
1774 
1775   // An OopMap for lock (and class if static)
1776   OopMapSet *oop_maps = new OopMapSet();
1777   intptr_t start = (intptr_t)__ pc();
1778 
1779   // We have received a description of where all the Java args are located
1780   // on entry to the wrapper. We need to convert these args to where
1781   // the jni function will expect them. To figure out where they go
1782   // we convert the java signature to a C signature by inserting
1783   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1784 
1785   const int total_in_args = method->size_of_parameters();
1786   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1787 
1788   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1789   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1790   BasicType* in_elem_bt = nullptr;
1791 
1792   int argc = 0;
1793   out_sig_bt[argc++] = T_ADDRESS;
1794   if (method->is_static()) {
1795     out_sig_bt[argc++] = T_OBJECT;
1796   }
1797 
1798   for (int i = 0; i < total_in_args ; i++ ) {
1799     out_sig_bt[argc++] = in_sig_bt[i];
1800   }
1801 
1802   // Now figure out where the args must be stored and how much stack space
1803   // they require.
1804   int out_arg_slots;
1805   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1806 
1807   // Compute framesize for the wrapper.  We need to handlize all oops in
1808   // incoming registers
1809 
1810   // Calculate the total number of stack slots we will need.
1811 
1812   // First count the abi requirement plus all of the outgoing args
1813   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1814 
1815   // Now the space for the inbound oop handle area
1816   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1817 
1818   int oop_handle_offset = stack_slots;
1819   stack_slots += total_save_slots;
1820 
1821   // Now any space we need for handlizing a klass if static method
1822 
1823   int klass_slot_offset = 0;
1824   int klass_offset = -1;
1825   int lock_slot_offset = 0;
1826   bool is_static = false;
1827 
1828   if (method->is_static()) {
1829     klass_slot_offset = stack_slots;
1830     stack_slots += VMRegImpl::slots_per_word;
1831     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1832     is_static = true;
1833   }
1834 
1835   // Plus a lock if needed
1836 
1837   if (method->is_synchronized()) {
1838     lock_slot_offset = stack_slots;
1839     stack_slots += VMRegImpl::slots_per_word;
1840   }
1841 
1842   // Now a place (+2) to save return values or temp during shuffling
1843   // + 4 for return address (which we own) and saved rbp
1844   stack_slots += 6;
1845 
1846   // Ok The space we have allocated will look like:
1847   //
1848   //
1849   // FP-> |                     |
1850   //      |---------------------|
1851   //      | 2 slots for moves   |
1852   //      |---------------------|
1853   //      | lock box (if sync)  |
1854   //      |---------------------| <- lock_slot_offset
1855   //      | klass (if static)   |
1856   //      |---------------------| <- klass_slot_offset
1857   //      | oopHandle area      |
1858   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1859   //      | outbound memory     |
1860   //      | based arguments     |
1861   //      |                     |
1862   //      |---------------------|
1863   //      |                     |
1864   // SP-> | out_preserved_slots |
1865   //
1866   //
1867 
1868 
1869   // Now compute the actual number of stack words we need, rounding up to keep
1870   // the stack properly aligned.
1871   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1872 
1873   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1874 
1875   // First thing make an ic check to see if we should even be here
1876 
1877   // We are free to use all registers as temps without saving them and
1878   // restoring them except rbp. rbp is the only callee save register
1879   // as far as the interpreter and the compiler(s) are concerned.
1880 
1881 
1882   const Register ic_reg = rax;
1883   const Register receiver = j_rarg0;
1884 
1885   Label hit;
1886   Label exception_pending;
1887 
1888   assert_different_registers(ic_reg, receiver, rscratch1, rscratch2);
1889   __ verify_oop(receiver);
1890   __ load_klass(rscratch1, receiver, rscratch2);
1891   __ cmpq(ic_reg, rscratch1);
1892   __ jcc(Assembler::equal, hit);
1893 
1894   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1895 
1896   // Verified entry point must be aligned
1897   __ align(8);
1898 
1899   __ bind(hit);
1900 
1901   int vep_offset = ((intptr_t)__ pc()) - start;
1902 
1903   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1904     Label L_skip_barrier;
1905     Register klass = r10;
1906     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1907     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1908 
1909     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1910 
1911     __ bind(L_skip_barrier);
1912   }
1913 
1914 #ifdef COMPILER1
1915   // For Object.hashCode and System.identityHashCode, try to pull the hashCode from the object header if available.
1916   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1917     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1918   }
1919 #endif // COMPILER1
1920 
1921   // The instruction at the verified entry point must be 5 bytes or longer
1922   // because it can be patched on the fly by make_non_entrant. The stack bang
1923   // instruction fits that requirement.
1924 
1925   // Generate stack overflow check
1926   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1927 
1928   // Generate a new frame for the wrapper.
1929   __ enter();
1930   // -2 because return address is already present and so is saved rbp
1931   __ subptr(rsp, stack_size - 2*wordSize);
1932 
1933   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1934   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1935   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1936 
1937   // Frame is now completed as far as size and linkage.
1938   int frame_complete = ((intptr_t)__ pc()) - start;
1939 
1940   if (UseRTMLocking) {
1941     // Abort RTM transaction before calling JNI
1942     // because critical section will be large and will be
1943     // aborted anyway. Also nmethod could be deoptimized.
1944     __ xabort(0);
1945   }
1946 
1947 #ifdef ASSERT
1948   __ check_stack_alignment(rsp, "improperly aligned stack");
1949 #endif /* ASSERT */
1950 
1951 
1952   // We use r14 as the oop handle for the receiver/klass
1953   // It is callee save so it survives the call to native
1954 
1955   const Register oop_handle_reg = r14;
1956 
1957   //
1958   // We immediately shuffle the arguments so that any vm call we have to
1959   // make from here on out (sync slow path, jvmti, etc.) we will have
1960   // captured the oops from our caller and have a valid oopMap for
1961   // them.
1962 
1963   // -----------------
1964   // The Grand Shuffle
1965 
1966   // The Java calling convention is either equal (linux) or denser (win64) than the
1967   // c calling convention. However, because of the jni_env argument the c calling
1968   // convention always has at least one more (and two for static) arguments than Java.
1969   // Therefore if we move the args from java -> c backwards then we will never have
1970   // a register->register conflict and we don't have to build a dependency graph
1971   // and figure out how to break any cycles.
1972   //
1973 
1974   // Record esp-based slot for receiver on stack for non-static methods
1975   int receiver_offset = -1;
1976 
1977   // This is a trick. We double the stack slots so we can claim
1978   // the oops in the caller's frame. Since we are sure to have
1979   // more args than the caller, doubling is enough to make
1980   // sure we can capture all the incoming oop args from the
1981   // caller.
1982   //
1983   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1984 
1985   // Mark location of rbp (someday)
1986   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1987 
1988   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1989   // All inbound args are referenced based on rbp and all outbound args via rsp.
1990 
1991 
1992 #ifdef ASSERT
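  // Track which registers the shuffle has already written, so we can assert that
  // no incoming argument is read from a register after it has been clobbered.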
1993   bool reg_destroyed[Register::number_of_registers];
1994   bool freg_destroyed[XMMRegister::number_of_registers];
1995   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
1996     reg_destroyed[r] = false;
1997   }
1998   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
1999     freg_destroyed[f] = false;
2000   }
2001 
2002 #endif /* ASSERT */
2003 
2004   // For JNI natives the incoming and outgoing registers are offset upwards.
2005   GrowableArray<int> arg_order(2 * total_in_args);
2006 
2007   VMRegPair tmp_vmreg;
2008   tmp_vmreg.set2(rbx->as_VMReg());
2009 
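  // Build (java_arg, c_arg) index pairs, last argument first, so the moves below
  // proceed backwards as described above.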
2010   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2011     arg_order.push(i);
2012     arg_order.push(c_arg);
2013   }
2014 
2015   int temploc = -1;
2016   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2017     int i = arg_order.at(ai);
2018     int c_arg = arg_order.at(ai + 1);
2019     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2020 #ifdef ASSERT
2021     if (in_regs[i].first()->is_Register()) {
2022       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2023     } else if (in_regs[i].first()->is_XMMRegister()) {
2024       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2025     }
2026     if (out_regs[c_arg].first()->is_Register()) {
2027       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2028     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2029       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2030     }
2031 #endif /* ASSERT */
2032     switch (in_sig_bt[i]) {
2033       case T_ARRAY:
2034       case T_OBJECT:
2035         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2036                     ((i == 0) && (!is_static)),
2037                     &receiver_offset);
2038         break;
2039       case T_VOID:
2040         break;
2041 
2042       case T_FLOAT:
2043         __ float_move(in_regs[i], out_regs[c_arg]);
2044         break;
2045 
2046       case T_DOUBLE:
2047         assert( i + 1 < total_in_args &&
2048                 in_sig_bt[i + 1] == T_VOID &&
2049                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2050         __ double_move(in_regs[i], out_regs[c_arg]);
2051         break;
2052 
2053       case T_LONG :
2054         __ long_move(in_regs[i], out_regs[c_arg]);
2055         break;
2056 
2057       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
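        // fall through to the default move (should not happen: no T_ADDRESS in Java args)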
2058 
2059       default:
2060         __ move32_64(in_regs[i], out_regs[c_arg]);
2061     }
2062   }
2063 
2064   int c_arg;
2065 
2066   // Pre-load a static method's oop into r14.  Used both by locking code and
2067   // the normal JNI call code.
2068   // point c_arg at the first arg that is already loaded in case we
2069   // need to spill before we call out
2070   c_arg = total_c_args - total_in_args;
2071 
2072   if (method->is_static()) {
2073 
2074     //  load oop into a register
2075     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2076 
2077     // Now handlize the static class mirror; it's known to be not-null.
2078     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2079     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2080 
2081     // Now get the handle
2082     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2083     // store the klass handle as second argument
2084     __ movptr(c_rarg1, oop_handle_reg);
2085     // and protect the arg if we must spill
2086     c_arg--;
2087   }
2088 
2089   // Change state to native (we save the return address in the thread, since it might not
2090   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2091   // points into the right code segment. It does not have to be the correct return pc.
2092   // We use the same pc/oopMap repeatedly when we call out
2093 
2094   intptr_t the_pc = (intptr_t) __ pc();
2095   oop_maps->add_gc_map(the_pc - start, map);
2096 
2097   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2098 
2099 
2100   // We have all of the arguments set up at this point. We must not touch any of the
2101   // argument registers from here on (when we do save/restore them there is no oop map covering them).
2102 
2103   {
2104     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2105     // protect the args we've loaded
2106     save_args(masm, total_c_args, c_arg, out_regs);
2107     __ mov_metadata(c_rarg1, method());
2108     __ call_VM_leaf(
2109       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2110       r15_thread, c_rarg1);
2111     restore_args(masm, total_c_args, c_arg, out_regs);
2112   }
2113 
2114   // RedefineClasses() tracing support for obsolete method entry
2115   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2116     // protect the args we've loaded
2117     save_args(masm, total_c_args, c_arg, out_regs);
2118     __ mov_metadata(c_rarg1, method());
2119     __ call_VM_leaf(
2120       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2121       r15_thread, c_rarg1);
2122     restore_args(masm, total_c_args, c_arg, out_regs);
2123   }
2124 
2125   // Lock a synchronized method
2126 
2127   // Register definitions used by locking and unlocking
2128 
2129   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2130   const Register obj_reg  = rbx;  // Will contain the oop
2131   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2132   const Register old_hdr  = r13;  // value of old header at unlock time
2133 
2134   Label slow_path_lock;
2135   Label lock_done;
2136 
2137   if (method->is_synchronized()) {
2138     Label count_mon;
2139 
2140     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2141 
2142     // Get the handle (the 2nd argument)
2143     __ mov(oop_handle_reg, c_rarg1);
2144 
2145     // Get address of the box
2146 
2147     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2148 
2149     // Load the oop from the handle
2150     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2151 
2152     if (LockingMode == LM_MONITOR) {
2153       __ jmp(slow_path_lock);
2154     } else if (LockingMode == LM_LEGACY) {
2155       // Load immediate 1 into swap_reg %rax
2156       __ movl(swap_reg, 1);
2157 
2158       // Load (object->mark() | 1) into swap_reg %rax
2159       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2160 
2161       // Save (object->mark() | 1) into BasicLock's displaced header
2162       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2163 
2164       // src -> dest iff dest == rax else rax <- dest
2165       __ lock();
2166       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2167       __ jcc(Assembler::equal, count_mon);
2168 
2169       // Hmm should this move to the slow path code area???
2170 
2171       // Test if the oopMark is an obvious stack pointer, i.e.,
2172       //  1) (mark & 3) == 0, and
2173       //  2) rsp <= mark < rsp + os::pagesize()
2174       // These 3 tests can be done by evaluating the following
2175       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2176       // assuming both stack pointer and pagesize have their
2177       // least significant 2 bits clear.
2178       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2179 
2180       __ subptr(swap_reg, rsp);
2181       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2182 
2183       // Save the test result, for recursive case, the result is zero
2184       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2185       __ jcc(Assembler::notEqual, slow_path_lock);
2186     } else {
2187       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2188       // Load object header
2189       __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2190       __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2191     }
2192     __ bind(count_mon);
2193     __ inc_held_monitor_count();
2194 
2195     // Slow path will re-enter here
2196     __ bind(lock_done);
2197   }
2198 
2199   // Finally just about ready to make the JNI call
2200 
2201   // get JNIEnv* which is first argument to native
2202   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2203 
2204   // Now set thread in native
2205   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2206 
2207   __ call(RuntimeAddress(native_func));
2208 
2209   // Verify or restore cpu control state after JNI call
2210   __ restore_cpu_control_state_after_jni(rscratch1);
2211 
2212   // Unpack native results.
2213   switch (ret_type) {
2214   case T_BOOLEAN: __ c2bool(rax);            break;
2215   case T_CHAR   : __ movzwl(rax, rax);      break;
2216   case T_BYTE   : __ sign_extend_byte (rax); break;
2217   case T_SHORT  : __ sign_extend_short(rax); break;
2218   case T_INT    : /* nothing to do */        break;
2219   case T_DOUBLE :
2220   case T_FLOAT  :
2221     // Result is in xmm0 we'll save as needed
2222     break;
2223   case T_ARRAY:                 // Really a handle
2224   case T_OBJECT:                // Really a handle
2225       break; // can't de-handlize until after safepoint check
2226   case T_VOID: break;
2227   case T_LONG: break;
2228   default       : ShouldNotReachHere();
2229   }
2230 
2231   Label after_transition;
2232 
2233   // Switch thread to "native transition" state before reading the synchronization state.
2234   // This additional state is necessary because reading and testing the synchronization
2235   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2236   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2237   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2238   //     Thread A is resumed to finish this native method, but doesn't block here since it
2239   //     didn't see any synchronization in progress, and escapes.
2240   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2241 
2242   // Force this write out before the read below
2243   if (!UseSystemMemoryBarrier) {
2244     __ membar(Assembler::Membar_mask_bits(
2245               Assembler::LoadLoad | Assembler::LoadStore |
2246               Assembler::StoreLoad | Assembler::StoreStore));
2247   }
2248 
2249   // check for safepoint operation in progress and/or pending suspend requests
2250   {
2251     Label Continue;
2252     Label slow_path;
2253 
2254     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2255 
2256     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2257     __ jcc(Assembler::equal, Continue);
2258     __ bind(slow_path);
2259 
2260     // Don't use call_VM as it will see a possible pending exception and forward it
2261     // and never return here preventing us from clearing _last_native_pc down below.
2262     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2263     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2264     // by hand.
2265     //
2266     __ vzeroupper();
2267     save_native_result(masm, ret_type, stack_slots);
2268     __ mov(c_rarg0, r15_thread);
2269     __ mov(r12, rsp); // remember sp
2270     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2271     __ andptr(rsp, -16); // align stack as required by ABI
2272     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2273     __ mov(rsp, r12); // restore sp
2274     __ reinit_heapbase();
2275     // Restore any method result value
2276     restore_native_result(masm, ret_type, stack_slots);
2277     __ bind(Continue);
2278   }
2279 
2280   // change thread state
2281   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2282   __ bind(after_transition);
2283 
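  // If a stack overflow disabled the yellow/reserved zone, re-enable (reguard) it
  // before returning to Java; the slow path is out of line below.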
2284   Label reguard;
2285   Label reguard_done;
2286   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2287   __ jcc(Assembler::equal, reguard);
2288   __ bind(reguard_done);
2289 
2290   // native result if any is live
2291 
2292   // Unlock
2293   Label slow_path_unlock;
2294   Label unlock_done;
2295   if (method->is_synchronized()) {
2296 
2297     Label fast_done;
2298 
2299     // Get locked oop from the handle we passed to jni
2300     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2301 
2302     if (LockingMode == LM_LEGACY) {
2303       Label not_recur;
2304       // Simple recursive lock?
2305       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2306       __ jcc(Assembler::notEqual, not_recur);
2307       __ dec_held_monitor_count();
2308       __ jmpb(fast_done);
2309       __ bind(not_recur);
2310     }
2311 
2312     // Must save rax if it is live now because cmpxchg must use it
2313     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2314       save_native_result(masm, ret_type, stack_slots);
2315     }
2316 
2317     if (LockingMode == LM_MONITOR) {
2318       __ jmp(slow_path_unlock);
2319     } else if (LockingMode == LM_LEGACY) {
2320       // get address of the stack lock
2321       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2322       //  get old displaced header
2323       __ movptr(old_hdr, Address(rax, 0));
2324 
2325       // Atomic swap old header if oop still contains the stack lock
2326       __ lock();
2327       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2328       __ jcc(Assembler::notEqual, slow_path_unlock);
2329       __ dec_held_monitor_count();
2330     } else {
2331       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2332       __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2333       __ andptr(swap_reg, ~(int32_t)markWord::lock_mask_in_place);
2334       __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2335       __ dec_held_monitor_count();
2336     }
2337 
2338     // slow path re-enters here
2339     __ bind(unlock_done);
2340     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2341       restore_native_result(masm, ret_type, stack_slots);
2342     }
2343 
2344     __ bind(fast_done);
2345   }
2346   {
2347     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2348     save_native_result(masm, ret_type, stack_slots);
2349     __ mov_metadata(c_rarg1, method());
2350     __ call_VM_leaf(
2351          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2352          r15_thread, c_rarg1);
2353     restore_native_result(masm, ret_type, stack_slots);
2354   }
2355 
2356   __ reset_last_Java_frame(false);
2357 
2358   // Unbox oop result, e.g. JNIHandles::resolve value.
2359   if (is_reference_type(ret_type)) {
2360     __ resolve_jobject(rax /* value */,
2361                        r15_thread /* thread */,
2362                        rcx /* tmp */);
2363   }
2364 
2365   if (CheckJNICalls) {
2366     // clear_pending_jni_exception_check
2367     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2368   }
2369 
2370   // reset handle block
2371   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2372   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2373 
2374   // pop our frame
2375 
2376   __ leave();
2377 
2378   // Any exception pending?
2379   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2380   __ jcc(Assembler::notEqual, exception_pending);
2381 
2382   // Return
2383 
2384   __ ret(0);
2385 
2386   // Unexpected paths are out of line and go here
2387 
2388   // An exception is pending
2389   __ bind(exception_pending);
2390 
2391   // forward the exception
2392   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2393 
2394   // Slow path locking & unlocking
2395   if (method->is_synchronized()) {
2396 
2397     // BEGIN Slow path lock
2398     __ bind(slow_path_lock);
2399 
2400     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2401     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2402 
2403     // protect the args we've loaded
2404     save_args(masm, total_c_args, c_arg, out_regs);
2405 
2406     __ mov(c_rarg0, obj_reg);
2407     __ mov(c_rarg1, lock_reg);
2408     __ mov(c_rarg2, r15_thread);
2409 
2410     // Not a leaf but we have last_Java_frame setup as we want
2411     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2412     restore_args(masm, total_c_args, c_arg, out_regs);
2413 
2414 #ifdef ASSERT
2415     { Label L;
2416     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2417     __ jcc(Assembler::equal, L);
2418     __ stop("no pending exception allowed on exit from monitorenter");
2419     __ bind(L);
2420     }
2421 #endif
2422     __ jmp(lock_done);
2423 
2424     // END Slow path lock
2425 
2426     // BEGIN Slow path unlock
2427     __ bind(slow_path_unlock);
2428 
2429     // If we haven't already saved the native result we must save it now as xmm registers
2430     // are still exposed.
2431     __ vzeroupper();
2432     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2433       save_native_result(masm, ret_type, stack_slots);
2434     }
2435 
2436     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2437 
2438     __ mov(c_rarg0, obj_reg);
2439     __ mov(c_rarg2, r15_thread);
2440     __ mov(r12, rsp); // remember sp
2441     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2442     __ andptr(rsp, -16); // align stack as required by ABI
2443 
2444     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2445     // NOTE that obj_reg == rbx currently
2446     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2447     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2448 
2449     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2450     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2451     __ mov(rsp, r12); // restore sp
2452     __ reinit_heapbase();
2453 #ifdef ASSERT
2454     {
2455       Label L;
2456       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2457       __ jcc(Assembler::equal, L);
2458       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2459       __ bind(L);
2460     }
2461 #endif /* ASSERT */
2462 
2463     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2464 
2465     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2466       restore_native_result(masm, ret_type, stack_slots);
2467     }
2468     __ jmp(unlock_done);
2469 
2470     // END Slow path unlock
2471 
2472   } // synchronized
2473 
2474   // SLOW PATH Reguard the stack if needed
2475 
2476   __ bind(reguard);
2477   __ vzeroupper();
2478   save_native_result(masm, ret_type, stack_slots);
2479   __ mov(r12, rsp); // remember sp
2480   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2481   __ andptr(rsp, -16); // align stack as required by ABI
2482   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2483   __ mov(rsp, r12); // restore sp
2484   __ reinit_heapbase();
2485   restore_native_result(masm, ret_type, stack_slots);
2486   // and continue
2487   __ jmp(reguard_done);
2488 
2489 
2490 
2491   __ flush();
2492 
2493   nmethod *nm = nmethod::new_native_nmethod(method,
2494                                             compile_id,
2495                                             masm->code(),
2496                                             vep_offset,
2497                                             frame_complete,
2498                                             stack_slots / VMRegImpl::slots_per_word,
2499                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2500                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2501                                             oop_maps);
2502 
2503   return nm;
2504 }
2505 
2506 // This function returns the adjustment (in number of words) to a c2i adapter
2507 // activation, for use during deoptimization.
2508 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2509   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2510 }
2511 
2512 
2513 uint SharedRuntime::out_preserve_stack_slots() {
2514   return 0;
2515 }
2516 
2517 
2518 // Number of stack slots between incoming argument block and the start of
2519 // a new frame.  The PROLOG must add this many slots to the stack.  The
2520 // EPILOG must remove this many slots.  amd64 needs two slots for the
2521 // return address and two for the saved rbp.
2522 uint SharedRuntime::in_preserve_stack_slots() {
2523   return 4 + 2 * VerifyStackAtCalls;
2524 }
2525 
2526 //------------------------------generate_deopt_blob----------------------------
2527 void SharedRuntime::generate_deopt_blob() {
2528   // Allocate space for the code
2529   ResourceMark rm;
2530   // Setup code generation tools
2531   int pad = 0;
2532   if (UseAVX > 2) {
2533     pad += 1024;
2534   }
2535 #if INCLUDE_JVMCI
2536   if (EnableJVMCI) {
2537     pad += 512; // Increase the buffer size when compiling for JVMCI
2538   }
2539 #endif
2540   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2541   MacroAssembler* masm = new MacroAssembler(&buffer);
2542   int frame_size_in_words;
2543   OopMap* map = nullptr;
2544   OopMapSet *oop_maps = new OopMapSet();
2545 
2546   // -------------
2547   // This code enters when returning to a de-optimized nmethod.  A return
2548   // address has been pushed on the stack, and return values are in
2549   // registers.
2550   // If we are doing a normal deopt then we were called from the patched
2551   // nmethod from the point we returned to the nmethod. So the return
2552   // address on the stack is wrong by NativeCall::instruction_size
2553   // We will adjust the value so it looks like we have the original return
2554   // address on the stack (like when we eagerly deoptimized).
2555   // In the case of an exception pending when deoptimizing, we enter
2556   // with a return address on the stack that points after the call we patched
2557   // into the exception handler. We have the following register state from,
2558   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2559   //    rax: exception oop
2560   //    rbx: exception handler
2561   //    rdx: throwing pc
2562   // So in this case we simply jam rdx into the useless return address and
2563   // the stack looks just like we want.
2564   //
2565   // At this point we need to de-opt.  We save the argument return
2566   // registers.  We call the first C routine, fetch_unroll_info().  This
2567   // routine captures the return values and returns a structure which
2568   // describes the current frame size and the sizes of all replacement frames.
2569   // The current frame is compiled code and may contain many inlined
2570   // functions, each with their own JVM state.  We pop the current frame, then
2571   // push all the new frames.  Then we call the C routine unpack_frames() to
2572   // populate these frames.  Finally unpack_frames() returns us the new target
2573   // address.  Notice that callee-save registers are BLOWN here; they have
2574   // already been captured in the vframeArray at the time the return PC was
2575   // patched.
2576   address start = __ pc();
2577   Label cont;
2578 
2579   // Prolog for non exception case!
2580 
2581   // Save everything in sight.
2582   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2583 
2584   // Normal deoptimization.  Save exec mode for unpack_frames.
2585   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2586   __ jmp(cont);
2587 
2588   int reexecute_offset = __ pc() - start;
2589 #if INCLUDE_JVMCI && !defined(COMPILER1)
2590   if (EnableJVMCI && UseJVMCICompiler) {
2591     // JVMCI does not use this kind of deoptimization
2592     __ should_not_reach_here();
2593   }
2594 #endif
2595 
2596   // Reexecute case
2597   // The return address is the pc that describes which bci to re-execute at.
2598 
2599   // No need to update map as each call to save_live_registers will produce identical oopmap
2600   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2601 
2602   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2603   __ jmp(cont);
2604 
2605 #if INCLUDE_JVMCI
2606   Label after_fetch_unroll_info_call;
2607   int implicit_exception_uncommon_trap_offset = 0;
2608   int uncommon_trap_offset = 0;
2609 
2610   if (EnableJVMCI) {
2611     implicit_exception_uncommon_trap_offset = __ pc() - start;
2612 
2613     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2614     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2615 
2616     uncommon_trap_offset = __ pc() - start;
2617 
2618     // Save everything in sight.
2619     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2620     // fetch_unroll_info needs to call last_java_frame()
2621     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2622 
2623     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2624     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2625 
2626     __ movl(r14, Deoptimization::Unpack_reexecute);
2627     __ mov(c_rarg0, r15_thread);
2628     __ movl(c_rarg2, r14); // exec mode
2629     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2630     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2631 
2632     __ reset_last_Java_frame(false);
2633 
2634     __ jmp(after_fetch_unroll_info_call);
2635   } // EnableJVMCI
2636 #endif // INCLUDE_JVMCI
2637 
2638   int exception_offset = __ pc() - start;
2639 
2640   // Prolog for exception case
2641 
2642   // all registers are dead at this entry point, except for rax and
2643   // rdx, which contain the exception oop and exception pc
2644   // respectively.  Set them in TLS and fall thru to the
2645   // unpack_with_exception_in_tls entry point.
2646 
2647   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2648   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2649 
2650   int exception_in_tls_offset = __ pc() - start;
2651 
2652   // new implementation because exception oop is now passed in JavaThread
2653 
2654   // Prolog for exception case
2655   // All registers must be preserved because they might be used by LinearScan
2656   // Exception oop and throwing PC are passed in JavaThread
2657   // tos: stack at point of call to method that threw the exception (i.e. only
2658   // args are on the stack, no return address)
2659 
2660   // make room on stack for the return address
2661   // It will be patched later with the throwing pc. The correct value is not
2662   // available now because loading it from memory would destroy registers.
2663   __ push(0);
2664 
2665   // Save everything in sight.
2666   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2667 
2668   // Now it is safe to overwrite any register
2669 
2670   // Deopt during an exception.  Save exec mode for unpack_frames.
2671   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2672 
2673   // load throwing pc from JavaThread and patch it as the return address
2674   // of the current frame. Then clear the field in JavaThread
2675 
2676   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2677   __ movptr(Address(rbp, wordSize), rdx);
2678   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2679 
2680 #ifdef ASSERT
2681   // verify that there is really an exception oop in JavaThread
2682   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2683   __ verify_oop(rax);
2684 
2685   // verify that there is no pending exception
2686   Label no_pending_exception;
2687   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2688   __ testptr(rax, rax);
2689   __ jcc(Assembler::zero, no_pending_exception);
2690   __ stop("must not have pending exception here");
2691   __ bind(no_pending_exception);
2692 #endif
2693 
2694   __ bind(cont);
2695 
2696   // Call C code.  Need thread and this frame, but NOT official VM entry
2697   // crud.  We cannot block on this call, no GC can happen.
2698   //
2699   // UnrollBlock* fetch_unroll_info(JavaThread* thread, int exec_mode)
2700 
2701   // fetch_unroll_info needs to call last_java_frame().
2702 
2703   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2704 #ifdef ASSERT
2705   { Label L;
2706     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2707     __ jcc(Assembler::equal, L);
2708     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2709     __ bind(L);
2710   }
2711 #endif // ASSERT
2712   __ mov(c_rarg0, r15_thread);
2713   __ movl(c_rarg1, r14); // exec_mode
2714   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2715 
2716   // Need to have an oopmap that tells fetch_unroll_info where to
2717   // find any register it might need.
2718   oop_maps->add_gc_map(__ pc() - start, map);
2719 
2720   __ reset_last_Java_frame(false);
2721 
2722 #if INCLUDE_JVMCI
2723   if (EnableJVMCI) {
2724     __ bind(after_fetch_unroll_info_call);
2725   }
2726 #endif
2727 
2728   // Load UnrollBlock* into rdi
2729   __ mov(rdi, rax);
2730 
2731   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2732   Label noException;
2733   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2734   __ jcc(Assembler::notEqual, noException);
2735   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2736   // QQQ this load is useless; exception_pc was cleared to null above
2737   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2738   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2739   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2740 
2741   __ verify_oop(rax);
2742 
2743   // Overwrite the result registers with the exception results.
2744   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2745   // I think this is useless
2746   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2747 
2748   __ bind(noException);
2749 
2750   // Only register save data is on the stack.
2751   // Now restore the result registers.  Everything else is either dead
2752   // or captured in the vframeArray.
2753   RegisterSaver::restore_result_registers(masm);
2754 
2755   // All of the register save area has been popped off the stack. Only the
2756   // return address remains.
2757 
2758   // Pop all the frames we must move/replace.
2759   //
2760   // Frame picture (youngest to oldest)
2761   // 1: self-frame (no frame link)
2762   // 2: deopting frame  (no frame link)
2763   // 3: caller of deopting frame (could be compiled/interpreted).
2764   //
2765   // Note: by leaving the return address of the self-frame on the stack
2766   // and using the size of frame 2 to adjust the stack,
2767   // the return address to frame 3 will still be on the stack when we are done.
2768 
2769   // Pop deoptimized frame
2770   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2771   __ addptr(rsp, rcx);
2772 
2773   // rsp should be pointing at the return address to the caller (3)
2774 
2775   // Pick up the initial fp we should save
2776   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2777   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2778 
2779 #ifdef ASSERT
2780   // Compilers generate code that bangs the stack by as much as the
2781   // interpreter would need. So this stack banging should never
2782   // trigger a fault. Verify that it does not on non-product builds.
2783   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2784   __ bang_stack_size(rbx, rcx);
2785 #endif
2786 
2787   // Load address of array of frame pcs into rcx
2788   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2789 
2790   // Trash the old pc
2791   __ addptr(rsp, wordSize);
2792 
2793   // Load address of array of frame sizes into rsi
2794   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2795 
2796   // Load counter into rdx
2797   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2798 
2799   // Now adjust the caller's stack to make up for the extra locals,
2800   // but record the original sp so that we can save it in the skeletal interpreter
2801   // frame; the stack walking of interpreter_sender will then get the unextended sp
2802   // value and not the "real" sp value.
2803 
2804   const Register sender_sp = r8;
2805 
2806   __ mov(sender_sp, rsp);
2807   __ movl(rbx, Address(rdi,
2808                        Deoptimization::UnrollBlock::
2809                        caller_adjustment_offset()));
2810   __ subptr(rsp, rbx);
2811 
2812   // Push interpreter frames in a loop
2813   Label loop;
2814   __ bind(loop);
2815   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2816   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2817   __ pushptr(Address(rcx, 0));          // Save return address
2818   __ enter();                           // Save old & set new ebp
2819   __ subptr(rsp, rbx);                  // Prolog
2820   // This value is corrected by layout_activation_impl
2821   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2822   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2823   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2824   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2825   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2826   __ decrementl(rdx);                   // Decrement counter
2827   __ jcc(Assembler::notZero, loop);
2828   __ pushptr(Address(rcx, 0));          // Save final return address
2829 
2830   // Re-push self-frame
2831   __ enter();                           // Save old & set new ebp
2832 
2833   // Allocate a full sized register save area.
2834   // Return address and rbp are in place, so we allocate two fewer words.
2835   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2836 
2837   // Restore frame locals after moving the frame
2838   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2839   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2840 
2841   // Call C code.  Need thread but NOT official VM entry
2842   // crud.  We cannot block on this call, no GC can happen.  Call should
2843   // restore return values to their stack-slots with the new SP.
2844   //
2845   // BasicType Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2846 
2847   // Use rbp because the frames look interpreted now
2848   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2849   // Don't need the precise return PC here, just precise enough to point into this code blob.
2850   address the_pc = __ pc();
2851   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2852 
2853   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2854   __ mov(c_rarg0, r15_thread);
2855   __ movl(c_rarg1, r14); // second arg: exec_mode
2856   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2857   // Revert SP alignment after call since we're going to do some SP relative addressing below
2858   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2859 
2860   // Set an oopmap for the call site
2861   // Use the same PC we used for the last java frame
2862   oop_maps->add_gc_map(the_pc - start,
2863                        new OopMap( frame_size_in_words, 0 ));
2864 
2865   // Clear fp AND pc
2866   __ reset_last_Java_frame(true);
2867 
2868   // Collect return values
2869   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2870   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2871   // I think this is useless (throwing pc?)
2872   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2873 
2874   // Pop self-frame.
2875   __ leave();                           // Epilog
2876 
2877   // Jump to interpreter
2878   __ ret(0);
2879 
2880   // Make sure all code is generated
2881   masm->flush();
2882 
2883   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2884   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2885 #if INCLUDE_JVMCI
2886   if (EnableJVMCI) {
2887     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2888     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2889   }
2890 #endif
2891 }
2892 
2893 #ifdef COMPILER2
2894 //------------------------------generate_uncommon_trap_blob--------------------
2895 void SharedRuntime::generate_uncommon_trap_blob() {
2896   // Allocate space for the code
2897   ResourceMark rm;
2898   // Setup code generation tools
2899   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2900   MacroAssembler* masm = new MacroAssembler(&buffer);
2901 
2902   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2903 
2904   address start = __ pc();
2905 
2906   if (UseRTMLocking) {
2907     // Abort RTM transaction before possible nmethod deoptimization.
2908     __ xabort(0);
2909   }
2910 
2911   // Push self-frame.  We get here with a return address on the
2912   // stack, so rsp is 8-byte aligned until we allocate our frame.
2913   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2914 
2915   // No callee saved registers. rbp is assumed implicitly saved
2916   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2917 
2918   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2919   // runtime expects it.
2920   __ movl(c_rarg1, j_rarg0);
2921 
2922   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2923 
2924   // Call C code.  Need thread but NOT official VM entry
2925   // crud.  We cannot block on this call, no GC can happen.  Call should
2926   // capture callee-saved registers as well as return values.
2927   // The thread is passed explicitly in c_rarg0 below.
2928   //
2929   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
2930 
2931   __ mov(c_rarg0, r15_thread);
2932   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2933   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2934 
2935   // Set an oopmap for the call site
2936   OopMapSet* oop_maps = new OopMapSet();
2937   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2938 
2939   // location of rbp is known implicitly by the frame sender code
2940 
2941   oop_maps->add_gc_map(__ pc() - start, map);
2942 
2943   __ reset_last_Java_frame(false);
2944 
2945   // Load UnrollBlock* into rdi
2946   __ mov(rdi, rax);
2947 
2948 #ifdef ASSERT
2949   { Label L;
2950     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
2951               Deoptimization::Unpack_uncommon_trap);
2952     __ jcc(Assembler::equal, L);
2953     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2954     __ bind(L);
2955   }
2956 #endif
2957 
2958   // Pop all the frames we must move/replace.
2959   //
2960   // Frame picture (youngest to oldest)
2961   // 1: self-frame (no frame link)
2962   // 2: deopting frame  (no frame link)
2963   // 3: caller of deopting frame (could be compiled/interpreted).
2964 
2965   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2966   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2967 
2968   // Pop deoptimized frame (int)
2969   __ movl(rcx, Address(rdi,
2970                        Deoptimization::UnrollBlock::
2971                        size_of_deoptimized_frame_offset()));
2972   __ addptr(rsp, rcx);
2973 
2974   // rsp should be pointing at the return address to the caller (3)
2975 
2976   // Pick up the initial fp we should save
2977   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2978   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2979 
2980 #ifdef ASSERT
2981   // Compilers generate code that bangs the stack by as much as the
2982   // interpreter would need. So this stack banging should never
2983   // trigger a fault. Verify that it does not on non-product builds.
2984   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2985   __ bang_stack_size(rbx, rcx);
2986 #endif
2987 
2988   // Load address of array of frame pcs into rcx (address*)
2989   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2990 
2991   // Trash the return pc
2992   __ addptr(rsp, wordSize);
2993 
2994   // Load address of array of frame sizes into rsi (intptr_t*)
2995   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2996 
2997   // Counter
2998   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
2999 
3000   // Now adjust the caller's stack to make up for the extra locals, but
3001   // record the original sp so that we can save it in the skeletal
3002   // interpreter frame; the stack walking of interpreter_sender
3003   // will then get the unextended sp value and not the "real" sp value.
3004 
3005   const Register sender_sp = r8;
3006 
3007   __ mov(sender_sp, rsp);
3008   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
3009   __ subptr(rsp, rbx);
3010 
3011   // Push interpreter frames in a loop
3012   Label loop;
3013   __ bind(loop);
3014   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3015   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3016   __ pushptr(Address(rcx, 0));     // Save return address
3017   __ enter();                      // Save old & set new rbp
3018   __ subptr(rsp, rbx);             // Prolog
3019   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3020             sender_sp);            // Make it walkable
3021   // This value is corrected by layout_activation_impl
3022   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3023   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3024   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3025   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3026   __ decrementl(rdx);              // Decrement counter
3027   __ jcc(Assembler::notZero, loop);
3028   __ pushptr(Address(rcx, 0));     // Save final return address
3029 
3030   // Re-push self-frame
3031   __ enter();                 // Save old & set new rbp
3032   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3033                               // Prolog
3034 
3035   // Use rbp because the frames look interpreted now
3036   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3037   // Don't need the precise return PC here, just precise enough to point into this code blob.
3038   address the_pc = __ pc();
3039   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3040 
3041   // Call C code.  Need thread but NOT official VM entry
3042   // crud.  We cannot block on this call, no GC can happen.  Call should
3043   // restore return values to their stack-slots with the new SP.
3044   // The thread is passed explicitly in c_rarg0 below.
3045   //
3046   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3047 
3048   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3049   __ mov(c_rarg0, r15_thread);
3050   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3051   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3052 
3053   // Set an oopmap for the call site
3054   // Use the same PC we used for the last java frame
3055   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3056 
3057   // Clear fp AND pc
3058   __ reset_last_Java_frame(true);
3059 
3060   // Pop self-frame.
3061   __ leave();                 // Epilog
3062 
3063   // Jump to interpreter
3064   __ ret(0);
3065 
3066   // Make sure all code is generated
3067   masm->flush();
3068 
3069   _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
3070                                                  SimpleRuntimeFrame::framesize >> 1);
3071 }
3072 #endif // COMPILER2
3073 
3074 //------------------------------generate_handler_blob------
3075 //
3076 // Generate a special Compile2Runtime blob that saves all registers,
3077 // and setup oopmap.
3078 //
3079 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3080   assert(StubRoutines::forward_exception_entry() != nullptr,
3081          "must be generated before");
3082 
3083   ResourceMark rm;
3084   OopMapSet *oop_maps = new OopMapSet();
3085   OopMap* map;
3086 
3087   // Allocate space for the code.  Setup code generation tools.
3088   CodeBuffer buffer("handler_blob", 2048, 1024);
3089   MacroAssembler* masm = new MacroAssembler(&buffer);
3090 
3091   address start   = __ pc();
3092   address call_pc = nullptr;
3093   int frame_size_in_words;
3094   bool cause_return = (poll_type == POLL_AT_RETURN);
3095   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3096 
3097   if (UseRTMLocking) {
3098     // Abort RTM transaction before calling runtime
3099     // because critical section will be large and will be
3100     // aborted anyway. Also nmethod could be deoptimized.
3101     __ xabort(0);
3102   }
3103 
3104   // Make room for the return address (patched with the saved exception pc below)
3105   if (!cause_return) {
3106     __ push(rbx);
3107   }
3108 
3109   // Save registers, fpu state, and flags
3110   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3111 
3112   // The following is basically a call_VM.  However, we need the precise
3113   // address of the call in order to generate an oopmap. Hence, we do all the
3114   // work ourselves.
3115 
3116   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3117 
3118   // The return address must always be correct so that frame constructor never
3119   // sees an invalid pc.
3120 
3121   if (!cause_return) {
3122     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3123     // Additionally, rbx is a callee-saved register, so we can look at it later to determine
3124     // if someone changed the return address for us!
3125     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3126     __ movptr(Address(rbp, wordSize), rbx);
3127   }
3128 
3129   // Do the call
3130   __ mov(c_rarg0, r15_thread);
3131   __ call(RuntimeAddress(call_ptr));
3132 
3133   // Set an oopmap for the call site.  This oopmap will map all
3134   // oop-registers and debug-info registers as callee-saved.  This
3135   // will allow deoptimization at this safepoint to find all possible
3136   // debug-info recordings, as well as let GC find all oops.
3137 
3138   oop_maps->add_gc_map( __ pc() - start, map);
3139 
3140   Label noException;
3141 
3142   __ reset_last_Java_frame(false);
3143 
3144   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3145   __ jcc(Assembler::equal, noException);
3146 
3147   // Exception pending
3148 
3149   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3150 
3151   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3152 
3153   // No exception case
3154   __ bind(noException);
3155 
3156   Label no_adjust;
3157 #ifdef ASSERT
3158   Label bail;
3159 #endif
3160   if (!cause_return) {
3161     Label no_prefix, not_special;
3162 
3163     // If our stashed return pc was modified by the runtime we avoid touching it
3164     __ cmpptr(rbx, Address(rbp, wordSize));
3165     __ jccb(Assembler::notEqual, no_adjust);
3166 
3167     // Skip over the poll instruction.
3168     // See NativeInstruction::is_safepoint_poll()
3169     // Possible encodings:
3170     //      85 00       test   %eax,(%rax)
3171     //      85 01       test   %eax,(%rcx)
3172     //      85 02       test   %eax,(%rdx)
3173     //      85 03       test   %eax,(%rbx)
3174     //      85 06       test   %eax,(%rsi)
3175     //      85 07       test   %eax,(%rdi)
3176     //
3177     //   41 85 00       test   %eax,(%r8)
3178     //   41 85 01       test   %eax,(%r9)
3179     //   41 85 02       test   %eax,(%r10)
3180     //   41 85 03       test   %eax,(%r11)
3181     //   41 85 06       test   %eax,(%r14)
3182     //   41 85 07       test   %eax,(%r15)
3183     //
3184     //      85 04 24    test   %eax,(%rsp)
3185     //   41 85 04 24    test   %eax,(%r12)
3186     //      85 45 00    test   %eax,0x0(%rbp)
3187     //   41 85 45 00    test   %eax,0x0(%r13)
3188 
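         // Summarizing the encodings above: the poll itself is two bytes
         // (opcode 0x85 plus a ModRM byte), with one extra byte if a REX
         // prefix is present and one more if the base register needs a SIB
         // byte (rsp/r12) or a zero disp8 (rbp/r13). The code below steps
         // rbx over any prefix and extra byte, then over the opcode/ModRM pair.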
3189     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3190     __ jcc(Assembler::notEqual, no_prefix);
3191     __ addptr(rbx, 1);
3192     __ bind(no_prefix);
3193 #ifdef ASSERT
3194     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3195 #endif
3196     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3197     // r12/rsp 0x04
3198     // r13/rbp 0x05
3199     __ movzbq(rcx, Address(rbx, 1));
3200     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3201     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3202     __ cmpptr(rcx, 1);
3203     __ jcc(Assembler::above, not_special);
3204     __ addptr(rbx, 1);
3205     __ bind(not_special);
3206 #ifdef ASSERT
3207     // Verify the correct encoding of the poll we're about to skip.
3208     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3209     __ jcc(Assembler::notEqual, bail);
3210     // Mask out the modrm bits
3211     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3212     // rax encodes to 0, so if the bits are nonzero it's incorrect
3213     __ jcc(Assembler::notZero, bail);
3214 #endif
3215     // Adjust return pc forward to step over the safepoint poll instruction
3216     __ addptr(rbx, 2);
3217     __ movptr(Address(rbp, wordSize), rbx);
3218   }
3219 
3220   __ bind(no_adjust);
3221   // Normal exit, restore registers and exit.
3222   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3223   __ ret(0);
3224 
3225 #ifdef ASSERT
3226   __ bind(bail);
3227   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3228 #endif
3229 
3230   // Make sure all code is generated
3231   masm->flush();
3232 
3233   // Fill-out other meta info
3234   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3235 }
3236 
3237 //
3238 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3239 //
3240 // Generate a stub that calls into the VM to find out the proper destination
3241 // of a Java call. All the argument registers are live at this point,
3242 // but since this is generic code we don't know what they are and the caller
3243 // must do any GC of the args.
3244 //
3245 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3246   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3247 
3248   // allocate space for the code
3249   ResourceMark rm;
3250 
3251   CodeBuffer buffer(name, 1200, 512);
3252   MacroAssembler* masm = new MacroAssembler(&buffer);
3253 
3254   int frame_size_in_words;
3255 
3256   OopMapSet *oop_maps = new OopMapSet();
3257   OopMap* map = nullptr;
3258 
3259   int start = __ offset();
3260 
3261   // No need to save vector registers since they are caller-saved anyway.
3262   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3263 
3264   int frame_complete = __ offset();
3265 
3266   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3267 
3268   __ mov(c_rarg0, r15_thread);
3269 
3270   __ call(RuntimeAddress(destination));
3271 
3272 
3273   // Set an oopmap for the call site.
3274   // We need this not only for callee-saved registers, but also for volatile
3275   // registers that the compiler might be keeping live across a safepoint.
3276 
3277   oop_maps->add_gc_map( __ offset() - start, map);
3278 
3279   // rax contains the address we are going to jump to, assuming no exception was installed
3280 
3281   // clear last_Java_sp
3282   __ reset_last_Java_frame(false);
3283   // check for pending exceptions
3284   Label pending;
3285   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3286   __ jcc(Assembler::notEqual, pending);
3287 
3288   // get the returned Method*
3289   __ get_vm_result_2(rbx, r15_thread);
3290   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3291 
3292   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3293 
3294   RegisterSaver::restore_live_registers(masm);
3295 
3296   // We are back to the original state on entry and ready to go.
3297 
3298   __ jmp(rax);
3299 
3300   // Pending exception after the safepoint
3301 
3302   __ bind(pending);
3303 
3304   RegisterSaver::restore_live_registers(masm);
3305 
3306   // exception pending => remove activation and forward to exception handler
3307 
3308   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3309 
3310   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3311   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3312 
3313   // -------------
3314   // make sure all code is generated
3315   masm->flush();
3316 
3317   // return the blob
3318   // (frame size is in words)
3319   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3320 }
3321 
3322 //------------------------------Montgomery multiplication------------------------
3323 //
3324 
3325 #ifndef _WINDOWS
3326 
3327 // Subtract 0:b from carry:a.  Return carry.
3328 static julong
3329 sub(julong a[], julong b[], julong carry, long len) {
3330   long long i = 0, cnt = len;
3331   julong tmp;
3332   asm volatile("clc; "
3333                "0: ; "
3334                "mov (%[b], %[i], 8), %[tmp]; "
3335                "sbb %[tmp], (%[a], %[i], 8); "
3336                "inc %[i]; dec %[cnt]; "
3337                "jne 0b; "
3338                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3339                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3340                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3341                : "memory");
3342   return tmp;
3343 }
3344 
3345 // Multiply (unsigned) Long A by Long B, accumulating the double-
3346 // length result into the accumulator formed of T0, T1, and T2.
3347 #define MACC(A, B, T0, T1, T2)                                  \
3348 do {                                                            \
3349   unsigned long hi, lo;                                         \
3350   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3351            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3352            : "r"(A), "a"(B) : "cc");                            \
3353  } while(0)
3354 
3355 // As above, but add twice the double-length result into the
3356 // accumulator.
3357 #define MACC2(A, B, T0, T1, T2)                                 \
3358 do {                                                            \
3359   unsigned long hi, lo;                                         \
3360   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3361            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3362            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3363            : "r"(A), "a"(B) : "cc");                            \
3364  } while(0)
3365 
3366 #else //_WINDOWS
3367 
3368 static julong
3369 sub(julong a[], julong b[], julong carry, long len) {
3370   long i;
3371   julong tmp;
3372   unsigned char c = 1;
3373   for (i = 0; i < len; i++) {
3374     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3375     a[i] = tmp;
3376   }
3377   c = _addcarry_u64(c, carry, ~0, &tmp);
3378   return tmp;
3379 }
3380 
3381 // Multiply (unsigned) Long A by Long B, accumulating the double-
3382 // length result into the accumulator formed of T0, T1, and T2.
3383 #define MACC(A, B, T0, T1, T2)                          \
3384 do {                                                    \
3385   julong hi, lo;                            \
3386   lo = _umul128(A, B, &hi);                             \
3387   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3388   c = _addcarry_u64(c, hi, T1, &T1);                    \
3389   _addcarry_u64(c, T2, 0, &T2);                         \
3390  } while(0)
3391 
3392 // As above, but add twice the double-length result into the
3393 // accumulator.
3394 #define MACC2(A, B, T0, T1, T2)                         \
3395 do {                                                    \
3396   julong hi, lo;                            \
3397   lo = _umul128(A, B, &hi);                             \
3398   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3399   c = _addcarry_u64(c, hi, T1, &T1);                    \
3400   _addcarry_u64(c, T2, 0, &T2);                         \
3401   c = _addcarry_u64(0, lo, T0, &T0);                    \
3402   c = _addcarry_u64(c, hi, T1, &T1);                    \
3403   _addcarry_u64(c, T2, 0, &T2);                         \
3404  } while(0)
3405 
3406 #endif //_WINDOWS
3407 
3408 // Fast Montgomery multiplication.  The derivation of the algorithm is
3409 // in A Cryptographic Library for the Motorola DSP56000,
3410 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
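     //
     // In outline (with R = 2^(64*len) and inv == -n[0]^-1 mod 2^64): each
     // iteration picks m[i] = t0 * inv so that the low word of the running
     // sum a*b + m*n becomes zero, which is what allows the accumulator to be
     // shifted down one word per step. After both loops m holds a*b*R^-1
     // (mod n), up to the final subtractions of n at the end.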
3411 
3412 static void NOINLINE
3413 montgomery_multiply(julong a[], julong b[], julong n[],
3414                     julong m[], julong inv, int len) {
3415   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3416   int i;
3417 
3418   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3419 
3420   for (i = 0; i < len; i++) {
3421     int j;
3422     for (j = 0; j < i; j++) {
3423       MACC(a[j], b[i-j], t0, t1, t2);
3424       MACC(m[j], n[i-j], t0, t1, t2);
3425     }
3426     MACC(a[i], b[0], t0, t1, t2);
3427     m[i] = t0 * inv;
3428     MACC(m[i], n[0], t0, t1, t2);
3429 
3430     assert(t0 == 0, "broken Montgomery multiply");
3431 
3432     t0 = t1; t1 = t2; t2 = 0;
3433   }
3434 
3435   for (i = len; i < 2*len; i++) {
3436     int j;
3437     for (j = i-len+1; j < len; j++) {
3438       MACC(a[j], b[i-j], t0, t1, t2);
3439       MACC(m[j], n[i-j], t0, t1, t2);
3440     }
3441     m[i-len] = t0;
3442     t0 = t1; t1 = t2; t2 = 0;
3443   }
3444 
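       // A carry out of the top word (t0 != 0) means the result does not fit
       // in len words; subtract n until the carry clears.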
3445   while (t0)
3446     t0 = sub(m, n, t0, len);
3447 }
3448 
3449 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3450 // multiplies so it should be up to 25% faster than Montgomery
3451 // multiplication.  However, its loop control is more complex and it
3452 // may actually run slower on some machines.
3453 
3454 static void NOINLINE
3455 montgomery_square(julong a[], julong n[],
3456                   julong m[], julong inv, int len) {
3457   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3458   int i;
3459 
3460   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3461 
3462   for (i = 0; i < len; i++) {
3463     int j;
3464     int end = (i+1)/2;
3465     for (j = 0; j < end; j++) {
3466       MACC2(a[j], a[i-j], t0, t1, t2);
3467       MACC(m[j], n[i-j], t0, t1, t2);
3468     }
3469     if ((i & 1) == 0) {
3470       MACC(a[j], a[j], t0, t1, t2);
3471     }
3472     for (; j < i; j++) {
3473       MACC(m[j], n[i-j], t0, t1, t2);
3474     }
3475     m[i] = t0 * inv;
3476     MACC(m[i], n[0], t0, t1, t2);
3477 
3478     assert(t0 == 0, "broken Montgomery square");
3479 
3480     t0 = t1; t1 = t2; t2 = 0;
3481   }
3482 
3483   for (i = len; i < 2*len; i++) {
3484     int start = i-len+1;
3485     int end = start + (len - start)/2;
3486     int j;
3487     for (j = start; j < end; j++) {
3488       MACC2(a[j], a[i-j], t0, t1, t2);
3489       MACC(m[j], n[i-j], t0, t1, t2);
3490     }
3491     if ((i & 1) == 0) {
3492       MACC(a[j], a[j], t0, t1, t2);
3493     }
3494     for (; j < len; j++) {
3495       MACC(m[j], n[i-j], t0, t1, t2);
3496     }
3497     m[i-len] = t0;
3498     t0 = t1; t1 = t2; t2 = 0;
3499   }
3500 
3501   while (t0)
3502     t0 = sub(m, n, t0, len);
3503 }
3504 
3505 // Swap words in a longword.
3506 static julong swap(julong x) {
3507   return (x << 32) | (x >> 32);
3508 }
3509 
3510 // Copy len longwords from s to d, word-swapping as we go.  The
3511 // destination array is reversed.
3512 static void reverse_words(julong *s, julong *d, int len) {
3513   d += len;
3514   while(len-- > 0) {
3515     d--;
3516     *d = swap(*s);
3517     s++;
3518   }
3519 }
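
     // The jint arrays arrive most-significant-word first (the layout used by
     // the BigInteger Montgomery intrinsics), while the code above works on
     // little-endian arrays of julongs; reverse_words() converts between the
     // two layouts, and converts the result back on the way out.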
3520 
3521 // The threshold at which squaring is advantageous was determined
3522 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3523 #define MONTGOMERY_SQUARING_THRESHOLD 64
3524 
3525 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3526                                         jint len, jlong inv,
3527                                         jint *m_ints) {
3528   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3529   int longwords = len/2;
3530 
3531   // Make very sure we don't use so much space that the stack might
3532   // overflow.  512 jints corresponds to a 16384-bit integer and
3533   // will use a total of 8K bytes of stack space here.
3534   int divisor = sizeof(julong) * 4;
3535   guarantee(longwords <= 8192 / divisor, "must be");
3536   int total_allocation = longwords * sizeof (julong) * 4;
3537   julong *scratch = (julong *)alloca(total_allocation);
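       // (With 8-byte julongs, divisor is 32, so the guarantee above caps
       // longwords at 256, i.e. len at 512 jints and total_allocation at 8K.)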
3538 
3539   // Local scratch arrays
3540   julong
3541     *a = scratch + 0 * longwords,
3542     *b = scratch + 1 * longwords,
3543     *n = scratch + 2 * longwords,
3544     *m = scratch + 3 * longwords;
3545 
3546   reverse_words((julong *)a_ints, a, longwords);
3547   reverse_words((julong *)b_ints, b, longwords);
3548   reverse_words((julong *)n_ints, n, longwords);
3549 
3550   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3551 
3552   reverse_words(m, (julong *)m_ints, longwords);
3553 }
3554 
3555 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3556                                       jint len, jlong inv,
3557                                       jint *m_ints) {
3558   assert(len % 2 == 0, "array length in montgomery_square must be even");
3559   int longwords = len/2;
3560 
3561   // Make very sure we don't use so much space that the stack might
3562   // overflow.  512 jints corresponds to a 16384-bit integer and
3563   // will use a total of 6K bytes of stack space here.
3564   int divisor = sizeof(julong) * 3;
3565   guarantee(longwords <= (8192 / divisor), "must be");
3566   int total_allocation = longwords * sizeof (julong) * 3;
3567   julong *scratch = (julong *)alloca(total_allocation);
3568 
3569   // Local scratch arrays
3570   julong
3571     *a = scratch + 0 * longwords,
3572     *n = scratch + 1 * longwords,
3573     *m = scratch + 2 * longwords;
3574 
3575   reverse_words((julong *)a_ints, a, longwords);
3576   reverse_words((julong *)n_ints, n, longwords);
3577 
3578   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3579     ::montgomery_square(a, n, m, (julong)inv, longwords);
3580   } else {
3581     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3582   }
3583 
3584   reverse_words(m, (julong *)m_ints, longwords);
3585 }
3586 
3587 #ifdef COMPILER2
3588 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3589 //
3590 //------------------------------generate_exception_blob---------------------------
3591 // Creates the exception blob.
3592 // This code is jumped to from a compiled method when an exception is thrown
3593 // (see emit_exception_handler in the x86_64.ad file).
3594 //
3595 // Given an exception pc at a call, we call into the runtime for the
3596 // handler in this method. This handler might merely restore state
3597 // (i.e. callee-saved registers), unwind the frame, and jump to the
3598 // exception handler for the nmethod if there is no Java-level handler
3599 // for the nmethod.
3600 //
3601 // This code is entered with a jmp.
3602 //
3603 // Arguments:
3604 //   rax: exception oop
3605 //   rdx: exception pc
3606 //
3607 // Results:
3608 //   rax: exception oop
3609 //   rdx: exception pc in caller or ???
3610 //   destination: exception handler of caller
3611 //
3612 // Note: the exception pc MUST be at a call (precise debug information)
3613 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3614 //
3615 
3616 void OptoRuntime::generate_exception_blob() {
3617   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3618   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3619   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3620 
3621   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3622 
3623   // Allocate space for the code
3624   ResourceMark rm;
3625   // Setup code generation tools
3626   CodeBuffer buffer("exception_blob", 2048, 1024);
3627   MacroAssembler* masm = new MacroAssembler(&buffer);
3628 
3629 
3630   address start = __ pc();
3631 
3632   // Exception pc is 'return address' for stack walker
3633   __ push(rdx);
3634   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3635 
3636   // Save callee-saved registers.  See x86_64.ad.
3637 
3638   // rbp is an implicitly saved callee saved register (i.e., the calling
3639   // convention will save/restore it in the prolog/epilog). Other than that
3640   // there are no callee save registers now that adapter frames are gone.
3641 
3642   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3643 
3644   // Store exception in Thread object. We cannot pass any arguments to the
3645   // handle_exception call, since we do not want to make any assumption
3646   // about the size of the frame in which the exception happened.
3647   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3648   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3649   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3650 
3651   // This call does all the hard work.  It checks if an exception handler
3652   // exists in the method.
3653   // If so, it returns the handler address.
3654   // If not, it prepares for stack-unwinding, restoring the callee-save
3655   // registers of the frame being removed.
3656   //
3657   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3658 
3659   // At a method handle call, the stack may not be properly aligned
3660   // when returning with an exception.
3661   address the_pc = __ pc();
3662   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3663   __ mov(c_rarg0, r15_thread);
3664   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3665   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3666 
3667   // Set an oopmap for the call site.  This oopmap will only be used if we
3668   // are unwinding the stack.  Hence, all locations will be dead.
3669   // Callee-saved registers will be the same as the frame above (i.e.,
3670   // handle_exception_stub), since they were restored when we got the
3671   // exception.
3672 
3673   OopMapSet* oop_maps = new OopMapSet();
3674 
3675   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3676 
3677   __ reset_last_Java_frame(false);
3678 
3679   // Restore callee-saved registers
3680 
3681   // rbp is an implicitly saved callee-saved register (i.e., the calling
3682   // convention will save/restore it in the prolog/epilog). Other than that
3683   // there are no callee save registers now that adapter frames are gone.
3684 
3685   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3686 
3687   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3688   __ pop(rdx);                  // No need for exception pc anymore
3689 
3690   // rax: exception handler
3691 
3692   // We have a handler in rax (could be deopt blob).
3693   __ mov(r8, rax);
3694 
3695   // Get the exception oop
3696   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3697   // Get the exception pc in case we are deoptimized
3698   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3699 #ifdef ASSERT
3700   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3701   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3702 #endif
3703   // Clear the exception oop so GC no longer processes it as a root.
3704   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3705 
3706   // rax: exception oop
3707   // r8:  exception handler
3708   // rdx: exception pc
3709   // Jump to handler
3710 
3711   __ jmp(r8);
3712 
3713   // Make sure all code is generated
3714   masm->flush();
3715 
3716   // Set exception blob
3717   _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3718 }
3719 #endif // COMPILER2
3720