1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/vframeArray.hpp"
  56 #include "runtime/vm_version.hpp"
  57 #include "utilities/align.hpp"
  58 #include "utilities/checkedCast.hpp"
  59 #include "utilities/formatBuffer.hpp"
  60 #include "vmreg_x86.inline.hpp"
  61 #ifdef COMPILER1
  62 #include "c1/c1_Runtime1.hpp"
  63 #endif
  64 #ifdef COMPILER2
  65 #include "opto/runtime.hpp"
  66 #endif
  67 #if INCLUDE_JVMCI
  68 #include "jvmci/jvmciJavaClasses.hpp"
  69 #endif
  70 
  71 #define __ masm->
  72 
  73 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  74 
  75 class SimpleRuntimeFrame {
  76 
  77   public:
  78 
  79   // Most of the runtime stubs have this simple frame layout.
  80   // This class exists to make the layout shared in one place.
  81   // Offsets are for compiler stack slots, which are jints.
  82   enum layout {
  83     // The frame sender code expects that rbp will be in the "natural" place and
  84     // will override any oopMap setting for it. We must therefore force the layout
  85     // so that it agrees with the frame sender code.
  86     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  87     rbp_off2,
  88     return_off, return_off2,
  89     framesize
  90   };
  91 };
  92 
  93 class RegisterSaver {
  94   // Capture info about frame layout.  Layout offsets are in jint
  95   // units because compiler frame slots are jints.
  96 #define XSAVE_AREA_BEGIN 160
  97 #define XSAVE_AREA_YMM_BEGIN 576
  98 #define XSAVE_AREA_OPMASK_BEGIN 1088
  99 #define XSAVE_AREA_ZMM_BEGIN 1152
 100 #define XSAVE_AREA_UPPERBANK 1664
 101 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 102 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 103 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 104 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 105 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 106   enum layout {
 107     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 108     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 109     DEF_XMM_OFFS(0),
 110     DEF_XMM_OFFS(1),
 111     // 2..15 are implied in range usage
 112     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 113     DEF_YMM_OFFS(0),
 114     DEF_YMM_OFFS(1),
 115     // 2..15 are implied in range usage
 116     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 117     DEF_OPMASK_OFFS(0),
 118     DEF_OPMASK_OFFS(1),
 119     // 2..7 are implied in range usage
 120     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 121     DEF_ZMM_OFFS(0),
 122     DEF_ZMM_OFFS(1),
 123     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 124     DEF_ZMM_UPPER_OFFS(16),
 125     DEF_ZMM_UPPER_OFFS(17),
 126     // 18..31 are implied in range usage
 127     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 128     fpu_stateH_end,
 129     r15_off, r15H_off,
 130     r14_off, r14H_off,
 131     r13_off, r13H_off,
 132     r12_off, r12H_off,
 133     r11_off, r11H_off,
 134     r10_off, r10H_off,
 135     r9_off,  r9H_off,
 136     r8_off,  r8H_off,
 137     rdi_off, rdiH_off,
 138     rsi_off, rsiH_off,
 139     ignore_off, ignoreH_off,  // extra copy of rbp
 140     rsp_off, rspH_off,
 141     rbx_off, rbxH_off,
 142     rdx_off, rdxH_off,
 143     rcx_off, rcxH_off,
 144     rax_off, raxH_off,
 145     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 146     align_off, alignH_off,
 147     flags_off, flagsH_off,
 148     // The frame sender code expects that rbp will be in the "natural" place and
 149     // will override any oopMap setting for it. We must therefore force the layout
 150     // so that it agrees with the frame sender code.
 151     rbp_off, rbpH_off,        // copy of rbp we will restore
 152     return_off, returnH_off,  // slot for return address
 153     reg_save_size             // size in compiler stack slots
 154   };
 155 
 156  public:
 157   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 158   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 159 
 160   // Offsets into the register save area
 161   // Used by deoptimization when it is managing result register
 162   // values on its own
 163 
 164   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 165   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 166   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 167   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 168   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 169 
 170   // During deoptimization only the result registers need to be restored,
 171   // all the other values have already been extracted.
 172   static void restore_result_registers(MacroAssembler* masm);
 173 };
 174 
 175 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 176   int off = 0;
 177   int num_xmm_regs = XMMRegister::available_xmm_registers();
 178 #if COMPILER2_OR_JVMCI
 179   if (save_wide_vectors && UseAVX == 0) {
 180     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 181   }
 182   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 183 #else
 184   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 185 #endif
 186 
 187   // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
 188   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 189   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 190   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 191   // CodeBlob frame size is in words.
 192   int frame_size_in_words = frame_size_in_bytes / wordSize;
 193   *total_frame_words = frame_size_in_words;
 194 
 195   // Save registers, fpu state, and flags.
 196   // We assume caller has already pushed the return address onto the
 197   // stack, so rsp is 8-byte aligned here.
 198   // We push rpb twice in this sequence because we want the real rbp
 199   // to be under the return like a normal enter.
 200 
 201   __ enter();          // rsp becomes 16-byte aligned here
 202   __ push_CPU_state(); // Push a multiple of 16 bytes
 203 
 204   // push cpu state handles this on EVEX enabled targets
 205   if (save_wide_vectors) {
 206     // Save upper half of YMM registers(0..15)
 207     int base_addr = XSAVE_AREA_YMM_BEGIN;
 208     for (int n = 0; n < 16; n++) {
 209       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 210     }
 211     if (VM_Version::supports_evex()) {
 212       // Save upper half of ZMM registers(0..15)
 213       base_addr = XSAVE_AREA_ZMM_BEGIN;
 214       for (int n = 0; n < 16; n++) {
 215         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 216       }
 217       // Save full ZMM registers(16..num_xmm_regs)
 218       base_addr = XSAVE_AREA_UPPERBANK;
 219       off = 0;
 220       int vector_len = Assembler::AVX_512bit;
 221       for (int n = 16; n < num_xmm_regs; n++) {
 222         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 223       }
 224 #if COMPILER2_OR_JVMCI
 225       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 226       off = 0;
 227       for(int n = 0; n < KRegister::number_of_registers; n++) {
 228         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 229       }
 230 #endif
 231     }
 232   } else {
 233     if (VM_Version::supports_evex()) {
 234       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 235       int base_addr = XSAVE_AREA_UPPERBANK;
 236       off = 0;
 237       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 238       for (int n = 16; n < num_xmm_regs; n++) {
 239         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 240       }
 241 #if COMPILER2_OR_JVMCI
 242       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 243       off = 0;
 244       for(int n = 0; n < KRegister::number_of_registers; n++) {
 245         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 246       }
 247 #endif
 248     }
 249   }
 250   __ vzeroupper();
 251   if (frame::arg_reg_save_area_bytes != 0) {
 252     // Allocate argument register save area
 253     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 254   }
 255 
 256   // Set an oopmap for the call site.  This oopmap will map all
 257   // oop-registers and debug-info registers as callee-saved.  This
 258   // will allow deoptimization at this safepoint to find all possible
 259   // debug-info recordings, as well as let GC find all oops.
 260 
 261   OopMapSet *oop_maps = new OopMapSet();
 262   OopMap* map = new OopMap(frame_size_in_slots, 0);
 263 
 264 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 265 
 266   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 267   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 270   // rbp location is known implicitly by the frame sender code, needs no oopmap
 271   // and the location where rbp was saved by is ignored
 272   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 273   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 282   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 283   // on EVEX enabled targets, we get it included in the xsave area
 284   off = xmm0_off;
 285   int delta = xmm1_off - off;
 286   for (int n = 0; n < 16; n++) {
 287     XMMRegister xmm_name = as_XMMRegister(n);
 288     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 289     off += delta;
 290   }
 291   if (UseAVX > 2) {
 292     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 293     off = zmm16_off;
 294     delta = zmm17_off - off;
 295     for (int n = 16; n < num_xmm_regs; n++) {
 296       XMMRegister zmm_name = as_XMMRegister(n);
 297       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 298       off += delta;
 299     }
 300   }
 301 
 302 #if COMPILER2_OR_JVMCI
 303   if (save_wide_vectors) {
 304     // Save upper half of YMM registers(0..15)
 305     off = ymm0_off;
 306     delta = ymm1_off - ymm0_off;
 307     for (int n = 0; n < 16; n++) {
 308       XMMRegister ymm_name = as_XMMRegister(n);
 309       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 310       off += delta;
 311     }
 312     if (VM_Version::supports_evex()) {
 313       // Save upper half of ZMM registers(0..15)
 314       off = zmm0_off;
 315       delta = zmm1_off - zmm0_off;
 316       for (int n = 0; n < 16; n++) {
 317         XMMRegister zmm_name = as_XMMRegister(n);
 318         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 319         off += delta;
 320       }
 321     }
 322   }
 323 #endif // COMPILER2_OR_JVMCI
 324 
 325   // %%% These should all be a waste but we'll keep things as they were for now
 326   if (true) {
 327     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 328     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 331     // rbp location is known implicitly by the frame sender code, needs no oopmap
 332     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 333     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 342     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 343     // on EVEX enabled targets, we get it included in the xsave area
 344     off = xmm0H_off;
 345     delta = xmm1H_off - off;
 346     for (int n = 0; n < 16; n++) {
 347       XMMRegister xmm_name = as_XMMRegister(n);
 348       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 349       off += delta;
 350     }
 351     if (UseAVX > 2) {
 352       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 353       off = zmm16H_off;
 354       delta = zmm17H_off - off;
 355       for (int n = 16; n < num_xmm_regs; n++) {
 356         XMMRegister zmm_name = as_XMMRegister(n);
 357         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 358         off += delta;
 359       }
 360     }
 361   }
 362 
 363   return map;
 364 }
 365 
 366 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 367   int num_xmm_regs = XMMRegister::available_xmm_registers();
 368   if (frame::arg_reg_save_area_bytes != 0) {
 369     // Pop arg register save area
 370     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 371   }
 372 
 373 #if COMPILER2_OR_JVMCI
 374   if (restore_wide_vectors) {
 375     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 376     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 377   }
 378 #else
 379   assert(!restore_wide_vectors, "vectors are generated only by C2");
 380 #endif
 381 
 382   __ vzeroupper();
 383 
 384   // On EVEX enabled targets everything is handled in pop fpu state
 385   if (restore_wide_vectors) {
 386     // Restore upper half of YMM registers (0..15)
 387     int base_addr = XSAVE_AREA_YMM_BEGIN;
 388     for (int n = 0; n < 16; n++) {
 389       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 390     }
 391     if (VM_Version::supports_evex()) {
 392       // Restore upper half of ZMM registers (0..15)
 393       base_addr = XSAVE_AREA_ZMM_BEGIN;
 394       for (int n = 0; n < 16; n++) {
 395         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 396       }
 397       // Restore full ZMM registers(16..num_xmm_regs)
 398       base_addr = XSAVE_AREA_UPPERBANK;
 399       int vector_len = Assembler::AVX_512bit;
 400       int off = 0;
 401       for (int n = 16; n < num_xmm_regs; n++) {
 402         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 403       }
 404 #if COMPILER2_OR_JVMCI
 405       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 406       off = 0;
 407       for (int n = 0; n < KRegister::number_of_registers; n++) {
 408         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 409       }
 410 #endif
 411     }
 412   } else {
 413     if (VM_Version::supports_evex()) {
 414       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 415       int base_addr = XSAVE_AREA_UPPERBANK;
 416       int off = 0;
 417       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 418       for (int n = 16; n < num_xmm_regs; n++) {
 419         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 420       }
 421 #if COMPILER2_OR_JVMCI
 422       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 423       off = 0;
 424       for (int n = 0; n < KRegister::number_of_registers; n++) {
 425         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 426       }
 427 #endif
 428     }
 429   }
 430 
 431   // Recover CPU state
 432   __ pop_CPU_state();
 433   // Get the rbp described implicitly by the calling convention (no oopMap)
 434   __ pop(rbp);
 435 }
 436 
 437 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 438 
 439   // Just restore result register. Only used by deoptimization. By
 440   // now any callee save register that needs to be restored to a c2
 441   // caller of the deoptee has been extracted into the vframeArray
 442   // and will be stuffed into the c2i adapter we create for later
 443   // restoration so only result registers need to be restored here.
 444 
 445   // Restore fp result register
 446   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 447   // Restore integer result register
 448   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 449   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 450 
 451   // Pop all of the register save are off the stack except the return address
 452   __ addptr(rsp, return_offset_in_bytes());
 453 }
 454 
 455 // Is vector's size (in bytes) bigger than a size saved by default?
 456 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
 457 bool SharedRuntime::is_wide_vector(int size) {
 458   return size > 16;
 459 }
 460 
 461 // ---------------------------------------------------------------------------
 462 // Read the array of BasicTypes from a signature, and compute where the
 463 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 464 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 465 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 466 // as framesizes are fixed.
 467 // VMRegImpl::stack0 refers to the first slot 0(sp).
 468 // and VMRegImpl::stack0+1 refers to the memory word 4-byes higher.
 469 // Register up to Register::number_of_registers are the 64-bit
 470 // integer registers.
 471 
 472 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 473 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 474 // units regardless of build. Of course for i486 there is no 64 bit build
 475 
 476 // The Java calling convention is a "shifted" version of the C ABI.
 477 // By skipping the first C ABI register we can call non-static jni methods
 478 // with small numbers of arguments without having to shuffle the arguments
 479 // at all. Since we control the java ABI we ought to at least get some
 480 // advantage out of it.
 481 
 482 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 483                                            VMRegPair *regs,
 484                                            int total_args_passed) {
 485 
 486   // Create the mapping between argument positions and
 487   // registers.
 488   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 489     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 490   };
 491   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 492     j_farg0, j_farg1, j_farg2, j_farg3,
 493     j_farg4, j_farg5, j_farg6, j_farg7
 494   };
 495 
 496 
 497   uint int_args = 0;
 498   uint fp_args = 0;
 499   uint stk_args = 0;
 500 
 501   for (int i = 0; i < total_args_passed; i++) {
 502     switch (sig_bt[i]) {
 503     case T_BOOLEAN:
 504     case T_CHAR:
 505     case T_BYTE:
 506     case T_SHORT:
 507     case T_INT:
 508       if (int_args < Argument::n_int_register_parameters_j) {
 509         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 510       } else {
 511         stk_args = align_up(stk_args, 2);
 512         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 513         stk_args += 1;
 514       }
 515       break;
 516     case T_VOID:
 517       // halves of T_LONG or T_DOUBLE
 518       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 519       regs[i].set_bad();
 520       break;
 521     case T_LONG:
 522       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 523       // fall through
 524     case T_OBJECT:
 525     case T_ARRAY:
 526     case T_ADDRESS:
 527       if (int_args < Argument::n_int_register_parameters_j) {
 528         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 529       } else {
 530         stk_args = align_up(stk_args, 2);
 531         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 532         stk_args += 2;
 533       }
 534       break;
 535     case T_FLOAT:
 536       if (fp_args < Argument::n_float_register_parameters_j) {
 537         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 538       } else {
 539         stk_args = align_up(stk_args, 2);
 540         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 541         stk_args += 1;
 542       }
 543       break;
 544     case T_DOUBLE:
 545       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 546       if (fp_args < Argument::n_float_register_parameters_j) {
 547         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 548       } else {
 549         stk_args = align_up(stk_args, 2);
 550         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 551         stk_args += 2;
 552       }
 553       break;
 554     default:
 555       ShouldNotReachHere();
 556       break;
 557     }
 558   }
 559 
 560   return stk_args;
 561 }
 562 
 563 // Patch the callers callsite with entry to compiled code if it exists.
 564 static void patch_callers_callsite(MacroAssembler *masm) {
 565   Label L;
 566   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 567   __ jcc(Assembler::equal, L);
 568 
 569   // Save the current stack pointer
 570   __ mov(r13, rsp);
 571   // Schedule the branch target address early.
 572   // Call into the VM to patch the caller, then jump to compiled callee
 573   // rax isn't live so capture return address while we easily can
 574   __ movptr(rax, Address(rsp, 0));
 575 
 576   // align stack so push_CPU_state doesn't fault
 577   __ andptr(rsp, -(StackAlignmentInBytes));
 578   __ push_CPU_state();
 579   __ vzeroupper();
 580   // VM needs caller's callsite
 581   // VM needs target method
 582   // This needs to be a long call since we will relocate this adapter to
 583   // the codeBuffer and it may not reach
 584 
 585   // Allocate argument register save area
 586   if (frame::arg_reg_save_area_bytes != 0) {
 587     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 588   }
 589   __ mov(c_rarg0, rbx);
 590   __ mov(c_rarg1, rax);
 591   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 592 
 593   // De-allocate argument register save area
 594   if (frame::arg_reg_save_area_bytes != 0) {
 595     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 596   }
 597 
 598   __ vzeroupper();
 599   __ pop_CPU_state();
 600   // restore sp
 601   __ mov(rsp, r13);
 602   __ bind(L);
 603 }
 604 
 605 
 606 static void gen_c2i_adapter(MacroAssembler *masm,
 607                             int total_args_passed,
 608                             int comp_args_on_stack,
 609                             const BasicType *sig_bt,
 610                             const VMRegPair *regs,
 611                             Label& skip_fixup) {
 612   // Before we get into the guts of the C2I adapter, see if we should be here
 613   // at all.  We've come from compiled code and are attempting to jump to the
 614   // interpreter, which means the caller made a static call to get here
 615   // (vcalls always get a compiled target if there is one).  Check for a
 616   // compiled target.  If there is one, we need to patch the caller's call.
 617   patch_callers_callsite(masm);
 618 
 619   __ bind(skip_fixup);
 620 
 621   // Since all args are passed on the stack, total_args_passed *
 622   // Interpreter::stackElementSize is the space we need.
 623 
 624   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 625 
 626   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 627 
 628   // stack is aligned, keep it that way
 629   // This is not currently needed or enforced by the interpreter, but
 630   // we might as well conform to the ABI.
 631   extraspace = align_up(extraspace, 2*wordSize);
 632 
 633   // set senderSP value
 634   __ lea(r13, Address(rsp, wordSize));
 635 
 636 #ifdef ASSERT
 637   __ check_stack_alignment(r13, "sender stack not aligned");
 638 #endif
 639   if (extraspace > 0) {
 640     // Pop the return address
 641     __ pop(rax);
 642 
 643     __ subptr(rsp, extraspace);
 644 
 645     // Push the return address
 646     __ push(rax);
 647 
 648     // Account for the return address location since we store it first rather
 649     // than hold it in a register across all the shuffling
 650     extraspace += wordSize;
 651   }
 652 
 653 #ifdef ASSERT
 654   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 655 #endif
 656 
 657   // Now write the args into the outgoing interpreter space
 658   for (int i = 0; i < total_args_passed; i++) {
 659     if (sig_bt[i] == T_VOID) {
 660       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 661       continue;
 662     }
 663 
 664     // offset to start parameters
 665     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 666     int next_off = st_off - Interpreter::stackElementSize;
 667 
 668     // Say 4 args:
 669     // i   st_off
 670     // 0   32 T_LONG
 671     // 1   24 T_VOID
 672     // 2   16 T_OBJECT
 673     // 3    8 T_BOOL
 674     // -    0 return address
 675     //
 676     // However to make thing extra confusing. Because we can fit a long/double in
 677     // a single slot on a 64 bt vm and it would be silly to break them up, the interpreter
 678     // leaves one slot empty and only stores to a single slot. In this case the
 679     // slot that is occupied is the T_VOID slot. See I said it was confusing.
 680 
 681     VMReg r_1 = regs[i].first();
 682     VMReg r_2 = regs[i].second();
 683     if (!r_1->is_valid()) {
 684       assert(!r_2->is_valid(), "");
 685       continue;
 686     }
 687     if (r_1->is_stack()) {
 688       // memory to memory use rax
 689       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 690       if (!r_2->is_valid()) {
 691         // sign extend??
 692         __ movl(rax, Address(rsp, ld_off));
 693         __ movptr(Address(rsp, st_off), rax);
 694 
 695       } else {
 696 
 697         __ movq(rax, Address(rsp, ld_off));
 698 
 699         // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 700         // T_DOUBLE and T_LONG use two slots in the interpreter
 701         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 702           // ld_off == LSW, ld_off+wordSize == MSW
 703           // st_off == MSW, next_off == LSW
 704           __ movq(Address(rsp, next_off), rax);
 705 #ifdef ASSERT
 706           // Overwrite the unused slot with known junk
 707           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 708           __ movptr(Address(rsp, st_off), rax);
 709 #endif /* ASSERT */
 710         } else {
 711           __ movq(Address(rsp, st_off), rax);
 712         }
 713       }
 714     } else if (r_1->is_Register()) {
 715       Register r = r_1->as_Register();
 716       if (!r_2->is_valid()) {
 717         // must be only an int (or less ) so move only 32bits to slot
 718         // why not sign extend??
 719         __ movl(Address(rsp, st_off), r);
 720       } else {
 721         // Two VMREgs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 722         // T_DOUBLE and T_LONG use two slots in the interpreter
 723         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 724           // long/double in gpr
 725 #ifdef ASSERT
 726           // Overwrite the unused slot with known junk
 727           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 728           __ movptr(Address(rsp, st_off), rax);
 729 #endif /* ASSERT */
 730           __ movq(Address(rsp, next_off), r);
 731         } else {
 732           __ movptr(Address(rsp, st_off), r);
 733         }
 734       }
 735     } else {
 736       assert(r_1->is_XMMRegister(), "");
 737       if (!r_2->is_valid()) {
 738         // only a float use just part of the slot
 739         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 740       } else {
 741 #ifdef ASSERT
 742         // Overwrite the unused slot with known junk
 743         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 744         __ movptr(Address(rsp, st_off), rax);
 745 #endif /* ASSERT */
 746         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 747       }
 748     }
 749   }
 750 
 751   // Schedule the branch target address early.
 752   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 753   __ jmp(rcx);
 754 }
 755 
 756 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 757                         address code_start, address code_end,
 758                         Label& L_ok) {
 759   Label L_fail;
 760   __ lea(temp_reg, ExternalAddress(code_start));
 761   __ cmpptr(pc_reg, temp_reg);
 762   __ jcc(Assembler::belowEqual, L_fail);
 763   __ lea(temp_reg, ExternalAddress(code_end));
 764   __ cmpptr(pc_reg, temp_reg);
 765   __ jcc(Assembler::below, L_ok);
 766   __ bind(L_fail);
 767 }
 768 
 769 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 770                                     int total_args_passed,
 771                                     int comp_args_on_stack,
 772                                     const BasicType *sig_bt,
 773                                     const VMRegPair *regs) {
 774 
 775   // Note: r13 contains the senderSP on entry. We must preserve it since
 776   // we may do a i2c -> c2i transition if we lose a race where compiled
 777   // code goes non-entrant while we get args ready.
 778   // In addition we use r13 to locate all the interpreter args as
 779   // we must align the stack to 16 bytes on an i2c entry else we
 780   // lose alignment we expect in all compiled code and register
 781   // save code can segv when fxsave instructions find improperly
 782   // aligned stack pointer.
 783 
 784   // Adapters can be frameless because they do not require the caller
 785   // to perform additional cleanup work, such as correcting the stack pointer.
 786   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 787   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 788   // even if a callee has modified the stack pointer.
 789   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 790   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 791   // up via the senderSP register).
 792   // In other words, if *either* the caller or callee is interpreted, we can
 793   // get the stack pointer repaired after a call.
 794   // This is why c2i and i2c adapters cannot be indefinitely composed.
 795   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 796   // both caller and callee would be compiled methods, and neither would
 797   // clean up the stack pointer changes performed by the two adapters.
 798   // If this happens, control eventually transfers back to the compiled
 799   // caller, but with an uncorrected stack, causing delayed havoc.
 800 
 801   if (VerifyAdapterCalls &&
 802       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
 803     // So, let's test for cascading c2i/i2c adapters right now.
 804     //  assert(Interpreter::contains($return_addr) ||
 805     //         StubRoutines::contains($return_addr),
 806     //         "i2c adapter must return to an interpreter frame");
 807     __ block_comment("verify_i2c { ");
 808     // Pick up the return address
 809     __ movptr(rax, Address(rsp, 0));
 810     Label L_ok;
 811     if (Interpreter::code() != nullptr) {
 812       range_check(masm, rax, r11,
 813                   Interpreter::code()->code_start(),
 814                   Interpreter::code()->code_end(),
 815                   L_ok);
 816     }
 817     if (StubRoutines::initial_stubs_code() != nullptr) {
 818       range_check(masm, rax, r11,
 819                   StubRoutines::initial_stubs_code()->code_begin(),
 820                   StubRoutines::initial_stubs_code()->code_end(),
 821                   L_ok);
 822     }
 823     if (StubRoutines::final_stubs_code() != nullptr) {
 824       range_check(masm, rax, r11,
 825                   StubRoutines::final_stubs_code()->code_begin(),
 826                   StubRoutines::final_stubs_code()->code_end(),
 827                   L_ok);
 828     }
 829     const char* msg = "i2c adapter must return to an interpreter frame";
 830     __ block_comment(msg);
 831     __ stop(msg);
 832     __ bind(L_ok);
 833     __ block_comment("} verify_i2ce ");
 834   }
 835 
 836   // Must preserve original SP for loading incoming arguments because
 837   // we need to align the outgoing SP for compiled code.
 838   __ movptr(r11, rsp);
 839 
 840   // Pick up the return address
 841   __ pop(rax);
 842 
 843   // Convert 4-byte c2 stack slots to words.
 844   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 845 
 846   if (comp_args_on_stack) {
 847     __ subptr(rsp, comp_words_on_stack * wordSize);
 848   }
 849 
 850   // Ensure compiled code always sees stack at proper alignment
 851   __ andptr(rsp, -16);
 852 
 853   // push the return address and misalign the stack that youngest frame always sees
 854   // as far as the placement of the call instruction
 855   __ push(rax);
 856 
 857   // Put saved SP in another register
 858   const Register saved_sp = rax;
 859   __ movptr(saved_sp, r11);
 860 
 861   // Will jump to the compiled code just as if compiled code was doing it.
 862   // Pre-load the register-jump target early, to schedule it better.
 863   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 864 
 865 #if INCLUDE_JVMCI
 866   if (EnableJVMCI) {
 867     // check if this call should be routed towards a specific entry point
 868     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 869     Label no_alternative_target;
 870     __ jcc(Assembler::equal, no_alternative_target);
 871     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 872     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 873     __ bind(no_alternative_target);
 874   }
 875 #endif // INCLUDE_JVMCI
 876 
 877   // Now generate the shuffle code.  Pick up all register args and move the
 878   // rest through the floating point stack top.
 879   for (int i = 0; i < total_args_passed; i++) {
 880     if (sig_bt[i] == T_VOID) {
 881       // Longs and doubles are passed in native word order, but misaligned
 882       // in the 32-bit build.
 883       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 884       continue;
 885     }
 886 
 887     // Pick up 0, 1 or 2 words from SP+offset.
 888 
 889     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 890             "scrambled load targets?");
 891     // Load in argument order going down.
 892     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 893     // Point to interpreter value (vs. tag)
 894     int next_off = ld_off - Interpreter::stackElementSize;
 895     //
 896     //
 897     //
 898     VMReg r_1 = regs[i].first();
 899     VMReg r_2 = regs[i].second();
 900     if (!r_1->is_valid()) {
 901       assert(!r_2->is_valid(), "");
 902       continue;
 903     }
 904     if (r_1->is_stack()) {
 905       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 906       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 907 
 908       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 909       // and if we end up going thru a c2i because of a miss a reasonable value of r13
 910       // will be generated.
 911       if (!r_2->is_valid()) {
 912         // sign extend???
 913         __ movl(r13, Address(saved_sp, ld_off));
 914         __ movptr(Address(rsp, st_off), r13);
 915       } else {
 916         //
 917         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 918         // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
 919         // So we must adjust where to pick up the data to match the interpreter.
 920         //
 921         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
 922         // are accessed as negative so LSW is at LOW address
 923 
 924         // ld_off is MSW so get LSW
 925         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 926                            next_off : ld_off;
 927         __ movq(r13, Address(saved_sp, offset));
 928         // st_off is LSW (i.e. reg.first())
 929         __ movq(Address(rsp, st_off), r13);
 930       }
 931     } else if (r_1->is_Register()) {  // Register argument
 932       Register r = r_1->as_Register();
 933       assert(r != rax, "must be different");
 934       if (r_2->is_valid()) {
 935         //
 936         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 937         // the interpreter allocates two slots but only uses one for thr T_LONG or T_DOUBLE case
 938         // So we must adjust where to pick up the data to match the interpreter.
 939 
 940         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 941                            next_off : ld_off;
 942 
 943         // this can be a misaligned move
 944         __ movq(r, Address(saved_sp, offset));
 945       } else {
 946         // sign extend and use a full word?
 947         __ movl(r, Address(saved_sp, ld_off));
 948       }
 949     } else {
 950       if (!r_2->is_valid()) {
 951         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 952       } else {
 953         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 954       }
 955     }
 956   }
 957 
 958   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 959 
 960   // 6243940 We might end up in handle_wrong_method if
 961   // the callee is deoptimized as we race thru here. If that
 962   // happens we don't want to take a safepoint because the
 963   // caller frame will look interpreted and arguments are now
 964   // "compiled" so it is much better to make this transition
 965   // invisible to the stack walking code. Unfortunately if
 966   // we try and find the callee by normal means a safepoint
 967   // is possible. So we stash the desired callee in the thread
 968   // and the vm will find there should this case occur.
 969 
 970   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 971 
 972   // put Method* where a c2i would expect should we end up there
 973   // only needed because eof c2 resolve stubs return Method* as a result in
 974   // rax
 975   __ mov(rax, rbx);
 976   __ jmp(r11);
 977 }
 978 
 979 // ---------------------------------------------------------------
 980 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 981                                                             int total_args_passed,
 982                                                             int comp_args_on_stack,
 983                                                             const BasicType *sig_bt,
 984                                                             const VMRegPair *regs,
 985                                                             AdapterFingerPrint* fingerprint) {
 986   address i2c_entry = __ pc();
 987 
 988   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 989 
 990   // -------------------------------------------------------------------------
 991   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 992   // to the interpreter.  The args start out packed in the compiled layout.  They
 993   // need to be unpacked into the interpreter layout.  This will almost always
 994   // require some stack space.  We grow the current (compiled) stack, then repack
 995   // the args.  We  finally end in a jump to the generic interpreter entry point.
 996   // On exit from the interpreter, the interpreter will restore our SP (lest the
 997   // compiled code, which relies solely on SP and not RBP, get sick).
 998 
 999   address c2i_unverified_entry = __ pc();
1000   Label skip_fixup;
1001 
1002   Register data = rax;
1003   Register receiver = j_rarg0;
1004   Register temp = rbx;
1005 
1006   {
1007     __ ic_check(1 /* end_alignment */);
1008     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1009     // Method might have been compiled since the call site was patched to
1010     // interpreted if that is the case treat it as a miss so we can get
1011     // the call site corrected.
1012     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1013     __ jcc(Assembler::equal, skip_fixup);
1014     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1015   }
1016 
1017   address c2i_entry = __ pc();
1018 
1019   // Class initialization barrier for static methods
1020   address c2i_no_clinit_check_entry = nullptr;
1021   if (VM_Version::supports_fast_class_init_checks()) {
1022     Label L_skip_barrier;
1023     Register method = rbx;
1024 
1025     { // Bypass the barrier for non-static methods
1026       Register flags = rscratch1;
1027       __ movl(flags, Address(method, Method::access_flags_offset()));
1028       __ testl(flags, JVM_ACC_STATIC);
1029       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1030     }
1031 
1032     Register klass = rscratch1;
1033     __ load_method_holder(klass, method);
1034     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1035 
1036     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1037 
1038     __ bind(L_skip_barrier);
1039     c2i_no_clinit_check_entry = __ pc();
1040   }
1041 
1042   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1043   bs->c2i_entry_barrier(masm);
1044 
1045   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1046 
1047   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1048 }
1049 
1050 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1051                                          VMRegPair *regs,
1052                                          int total_args_passed) {
1053 
1054 // We return the amount of VMRegImpl stack slots we need to reserve for all
1055 // the arguments NOT counting out_preserve_stack_slots.
1056 
1057 // NOTE: These arrays will have to change when c1 is ported
1058 #ifdef _WIN64
1059     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1060       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1061     };
1062     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1063       c_farg0, c_farg1, c_farg2, c_farg3
1064     };
1065 #else
1066     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1067       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1068     };
1069     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1070       c_farg0, c_farg1, c_farg2, c_farg3,
1071       c_farg4, c_farg5, c_farg6, c_farg7
1072     };
1073 #endif // _WIN64
1074 
1075 
1076     uint int_args = 0;
1077     uint fp_args = 0;
1078     uint stk_args = 0; // inc by 2 each time
1079 
1080     for (int i = 0; i < total_args_passed; i++) {
1081       switch (sig_bt[i]) {
1082       case T_BOOLEAN:
1083       case T_CHAR:
1084       case T_BYTE:
1085       case T_SHORT:
1086       case T_INT:
1087         if (int_args < Argument::n_int_register_parameters_c) {
1088           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1089 #ifdef _WIN64
1090           fp_args++;
1091           // Allocate slots for callee to stuff register args the stack.
1092           stk_args += 2;
1093 #endif
1094         } else {
1095           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1096           stk_args += 2;
1097         }
1098         break;
1099       case T_LONG:
1100         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1101         // fall through
1102       case T_OBJECT:
1103       case T_ARRAY:
1104       case T_ADDRESS:
1105       case T_METADATA:
1106         if (int_args < Argument::n_int_register_parameters_c) {
1107           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1108 #ifdef _WIN64
1109           fp_args++;
1110           stk_args += 2;
1111 #endif
1112         } else {
1113           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1114           stk_args += 2;
1115         }
1116         break;
1117       case T_FLOAT:
1118         if (fp_args < Argument::n_float_register_parameters_c) {
1119           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1120 #ifdef _WIN64
1121           int_args++;
1122           // Allocate slots for callee to stuff register args the stack.
1123           stk_args += 2;
1124 #endif
1125         } else {
1126           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1127           stk_args += 2;
1128         }
1129         break;
1130       case T_DOUBLE:
1131         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1132         if (fp_args < Argument::n_float_register_parameters_c) {
1133           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1134 #ifdef _WIN64
1135           int_args++;
1136           // Allocate slots for callee to stuff register args the stack.
1137           stk_args += 2;
1138 #endif
1139         } else {
1140           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1141           stk_args += 2;
1142         }
1143         break;
1144       case T_VOID: // Halves of longs and doubles
1145         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1146         regs[i].set_bad();
1147         break;
1148       default:
1149         ShouldNotReachHere();
1150         break;
1151       }
1152     }
1153 #ifdef _WIN64
1154   // windows abi requires that we always allocate enough stack space
1155   // for 4 64bit registers to be stored down.
1156   if (stk_args < 8) {
1157     stk_args = 8;
1158   }
1159 #endif // _WIN64
1160 
1161   return stk_args;
1162 }
1163 
1164 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1165                                              uint num_bits,
1166                                              uint total_args_passed) {
1167   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1168          "only certain vector sizes are supported for now");
1169 
1170   static const XMMRegister VEC_ArgReg[32] = {
1171      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1172      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1173     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1174     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1175   };
1176 
1177   uint stk_args = 0;
1178   uint fp_args = 0;
1179 
1180   for (uint i = 0; i < total_args_passed; i++) {
1181     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1182     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1183     regs[i].set_pair(vmreg->next(next_val), vmreg);
1184   }
1185 
1186   return stk_args;
1187 }
1188 
1189 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1190   // We always ignore the frame_slots arg and just use the space just below frame pointer
1191   // which by this time is free to use
1192   switch (ret_type) {
1193   case T_FLOAT:
1194     __ movflt(Address(rbp, -wordSize), xmm0);
1195     break;
1196   case T_DOUBLE:
1197     __ movdbl(Address(rbp, -wordSize), xmm0);
1198     break;
1199   case T_VOID:  break;
1200   default: {
1201     __ movptr(Address(rbp, -wordSize), rax);
1202     }
1203   }
1204 }
1205 
1206 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1207   // We always ignore the frame_slots arg and just use the space just below frame pointer
1208   // which by this time is free to use
1209   switch (ret_type) {
1210   case T_FLOAT:
1211     __ movflt(xmm0, Address(rbp, -wordSize));
1212     break;
1213   case T_DOUBLE:
1214     __ movdbl(xmm0, Address(rbp, -wordSize));
1215     break;
1216   case T_VOID:  break;
1217   default: {
1218     __ movptr(rax, Address(rbp, -wordSize));
1219     }
1220   }
1221 }
1222 
1223 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1224     for ( int i = first_arg ; i < arg_count ; i++ ) {
1225       if (args[i].first()->is_Register()) {
1226         __ push(args[i].first()->as_Register());
1227       } else if (args[i].first()->is_XMMRegister()) {
1228         __ subptr(rsp, 2*wordSize);
1229         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1230       }
1231     }
1232 }
1233 
1234 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1235     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1236       if (args[i].first()->is_Register()) {
1237         __ pop(args[i].first()->as_Register());
1238       } else if (args[i].first()->is_XMMRegister()) {
1239         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1240         __ addptr(rsp, 2*wordSize);
1241       }
1242     }
1243 }
1244 
1245 static void verify_oop_args(MacroAssembler* masm,
1246                             const methodHandle& method,
1247                             const BasicType* sig_bt,
1248                             const VMRegPair* regs) {
1249   Register temp_reg = rbx;  // not part of any compiled calling seq
1250   if (VerifyOops) {
1251     for (int i = 0; i < method->size_of_parameters(); i++) {
1252       if (is_reference_type(sig_bt[i])) {
1253         VMReg r = regs[i].first();
1254         assert(r->is_valid(), "bad oop arg");
1255         if (r->is_stack()) {
1256           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1257           __ verify_oop(temp_reg);
1258         } else {
1259           __ verify_oop(r->as_Register());
1260         }
1261       }
1262     }
1263   }
1264 }
1265 
1266 static void check_continuation_enter_argument(VMReg actual_vmreg,
1267                                               Register expected_reg,
1268                                               const char* name) {
1269   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1270   assert(actual_vmreg->as_Register() == expected_reg,
1271          "%s is in unexpected register: %s instead of %s",
1272          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1273 }
1274 
1275 
1276 //---------------------------- continuation_enter_setup ---------------------------
1277 //
1278 // Arguments:
1279 //   None.
1280 //
1281 // Results:
1282 //   rsp: pointer to blank ContinuationEntry
1283 //
1284 // Kills:
1285 //   rax
1286 //
1287 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1288   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1289   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1290   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1291 
1292   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1293   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1294 
1295   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1296   OopMap* map = new OopMap(frame_size, 0);
1297 
1298   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1299   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1300   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1301 
1302   return map;
1303 }
1304 
1305 //---------------------------- fill_continuation_entry ---------------------------
1306 //
1307 // Arguments:
1308 //   rsp: pointer to blank Continuation entry
1309 //   reg_cont_obj: pointer to the continuation
1310 //   reg_flags: flags
1311 //
1312 // Results:
1313 //   rsp: pointer to filled out ContinuationEntry
1314 //
1315 // Kills:
1316 //   rax
1317 //
1318 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1319   assert_different_registers(rax, reg_cont_obj, reg_flags);
1320 #ifdef ASSERT
1321   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1322 #endif
1323   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1324   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1325   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1326   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1327   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1328 
1329   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1330   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1331   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1332   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1333 
1334   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1335   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1336 }
1337 
1338 //---------------------------- continuation_enter_cleanup ---------------------------
1339 //
1340 // Arguments:
1341 //   rsp: pointer to the ContinuationEntry
1342 //
1343 // Results:
1344 //   rsp: pointer to the spilled rbp in the entry frame
1345 //
1346 // Kills:
1347 //   rbx
1348 //
static void continuation_enter_cleanup(MacroAssembler* masm) {
1350 #ifdef ASSERT
1351   Label L_good_sp;
1352   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1353   __ jcc(Assembler::equal, L_good_sp);
1354   __ stop("Incorrect rsp at continuation_enter_cleanup");
1355   __ bind(L_good_sp);
1356 #endif
1357 
1358   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1359   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1360   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1361   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1362 
1363   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1364   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1365   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1366 }
1367 
1368 static void gen_continuation_enter(MacroAssembler* masm,
1369                                    const VMRegPair* regs,
1370                                    int& exception_offset,
1371                                    OopMapSet* oop_maps,
1372                                    int& frame_complete,
1373                                    int& stack_slots,
1374                                    int& interpreted_entry_offset,
1375                                    int& compiled_entry_offset) {
1376 
1377   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1378   int pos_cont_obj   = 0;
1379   int pos_is_cont    = 1;
1380   int pos_is_virtual = 2;
1381 
  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside in these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
1386   Register reg_cont_obj   = c_rarg1;
1387   Register reg_is_cont    = c_rarg2;
1388   Register reg_is_virtual = c_rarg3;
1389 
1390   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1391   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1392   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1393 
1394   // Utility methods kill rax, make sure there are no collisions
1395   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1396 
1397   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1398                          relocInfo::static_call_type);
1399 
1400   address start = __ pc();
1401 
1402   Label L_thaw, L_exit;
1403 
  // i2i entry, used only in interp_only_mode
1405   interpreted_entry_offset = __ pc() - start;
1406   {
1407 #ifdef ASSERT
1408     Label is_interp_only;
1409     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1410     __ jcc(Assembler::notEqual, is_interp_only);
1411     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1412     __ bind(is_interp_only);
1413 #endif
1414 
1415     __ pop(rax); // return address
1416     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1417     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1418     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1419     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1420     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1421     __ push(rax); // return address
1422     __ push_cont_fastpath();
1423 
1424     __ enter();
1425 
1426     stack_slots = 2; // will be adjusted in setup
1427     OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame
    // would appear unsafe. That's okay: at the very worst we'll miss an async sample, and we
    // are in interp_only_mode anyway.
1430 
1431     __ verify_oop(reg_cont_obj);
1432 
1433     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1434 
1435     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1436     __ testptr(reg_is_cont, reg_is_cont);
1437     __ jcc(Assembler::notZero, L_thaw);
1438 
1439     // --- Resolve path
1440 
1441     // Make sure the call is patchable
1442     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1443     // Emit stub for static call
1444     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1445     if (stub == nullptr) {
1446       fatal("CodeCache is full at gen_continuation_enter");
1447     }
1448     __ call(resolve);
1449     oop_maps->add_gc_map(__ pc() - start, map);
1450     __ post_call_nop();
1451 
1452     __ jmp(L_exit);
1453   }
1454 
1455   // compiled entry
1456   __ align(CodeEntryAlignment);
1457   compiled_entry_offset = __ pc() - start;
1458   __ enter();
1459 
1460   stack_slots = 2; // will be adjusted in setup
1461   OopMap* map = continuation_enter_setup(masm, stack_slots);
1462 
1463   // Frame is now completed as far as size and linkage.
1464   frame_complete = __ pc() - start;
1465 
1466   __ verify_oop(reg_cont_obj);
1467 
1468   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1469 
1470   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1471   __ testptr(reg_is_cont, reg_is_cont);
1472   __ jccb(Assembler::notZero, L_thaw);
1473 
1474   // --- call Continuation.enter(Continuation c, boolean isContinue)
1475 
1476   // Make sure the call is patchable
1477   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1478 
1479   // Emit stub for static call
1480   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1481   if (stub == nullptr) {
1482     fatal("CodeCache is full at gen_continuation_enter");
1483   }
1484 
1485   // The call needs to be resolved. There's a special case for this in
1486   // SharedRuntime::find_callee_info_helper() which calls
1487   // LinkResolver::resolve_continuation_enter() which resolves the call to
1488   // Continuation.enter(Continuation c, boolean isContinue).
1489   __ call(resolve);
1490 
1491   oop_maps->add_gc_map(__ pc() - start, map);
1492   __ post_call_nop();
1493 
1494   __ jmpb(L_exit);
1495 
1496   // --- Thawing path
1497 
1498   __ bind(L_thaw);
1499 
1500   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1501 
1502   ContinuationEntry::_return_pc_offset = __ pc() - start;
1503   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1504   __ post_call_nop();
1505 
1506   // --- Normal exit (resolve/thawing)
1507 
1508   __ bind(L_exit);
1509 
1510   continuation_enter_cleanup(masm);
1511   __ pop(rbp);
1512   __ ret(0);
1513 
1514   // --- Exception handling path
1515 
1516   exception_offset = __ pc() - start;
1517 
1518   continuation_enter_cleanup(masm);
1519   __ pop(rbp);
1520 
1521   __ movptr(c_rarg0, r15_thread);
1522   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1523 
1524   // rax still holds the original exception oop, save it before the call
1525   __ push(rax);
1526 
1527   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1528   __ movptr(rbx, rax);
1529 
1530   // Continue at exception handler:
1531   //   rax: exception oop
1532   //   rbx: exception handler
1533   //   rdx: exception pc
1534   __ pop(rax);
1535   __ verify_oop(rax);
1536   __ pop(rdx);
1537   __ jmp(rbx);
1538 }
1539 
1540 static void gen_continuation_yield(MacroAssembler* masm,
1541                                    const VMRegPair* regs,
1542                                    OopMapSet* oop_maps,
1543                                    int& frame_complete,
1544                                    int& stack_slots,
1545                                    int& compiled_entry_offset) {
1546   enum layout {
1547     rbp_off,
1548     rbpH_off,
1549     return_off,
1550     return_off2,
1551     framesize // inclusive of return address
1552   };
  stack_slots = framesize / VMRegImpl::slots_per_word;
1554   assert(stack_slots == 2, "recheck layout");
1555 
1556   address start = __ pc();
1557   compiled_entry_offset = __ pc() - start;
1558   __ enter();
1559   address the_pc = __ pc();
1560 
1561   frame_complete = the_pc - start;
1562 
  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, so associate the OopMap
  // with it right away.
1566   __ post_call_nop();
1567   OopMap* map = new OopMap(framesize, 1);
1568   oop_maps->add_gc_map(frame_complete, map);
1569 
1570   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1571   __ movptr(c_rarg0, r15_thread);
1572   __ movptr(c_rarg1, rsp);
1573   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1574   __ reset_last_Java_frame(true);
1575 
1576   Label L_pinned;
1577 
1578   __ testptr(rax, rax);
1579   __ jcc(Assembler::notZero, L_pinned);
1580 
1581   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1582   continuation_enter_cleanup(masm);
1583   __ pop(rbp);
1584   __ ret(0);
1585 
1586   __ bind(L_pinned);
1587 
1588   // Pinned, return to caller
1589 
1590   // handle pending exception thrown by freeze
1591   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1592   Label ok;
1593   __ jcc(Assembler::equal, ok);
1594   __ leave();
1595   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1596   __ bind(ok);
1597 
1598   __ leave();
1599   __ ret(0);
1600 }
1601 
1602 static void gen_special_dispatch(MacroAssembler* masm,
1603                                  const methodHandle& method,
1604                                  const BasicType* sig_bt,
1605                                  const VMRegPair* regs) {
1606   verify_oop_args(masm, method, sig_bt, regs);
1607   vmIntrinsics::ID iid = method->intrinsic_id();
1608 
1609   // Now write the args into the outgoing interpreter space
1610   bool     has_receiver   = false;
1611   Register receiver_reg   = noreg;
1612   int      member_arg_pos = -1;
1613   Register member_reg     = noreg;
1614   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1615   if (ref_kind != 0) {
1616     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1617     member_reg = rbx;  // known to be free at this point
1618     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1619   } else if (iid == vmIntrinsics::_invokeBasic) {
1620     has_receiver = true;
1621   } else if (iid == vmIntrinsics::_linkToNative) {
1622     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1623     member_reg = rbx;  // known to be free at this point
1624   } else {
1625     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1626   }
1627 
1628   if (member_reg != noreg) {
1629     // Load the member_arg into register, if necessary.
1630     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1631     VMReg r = regs[member_arg_pos].first();
1632     if (r->is_stack()) {
1633       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1634     } else {
1635       // no data motion is needed
1636       member_reg = r->as_Register();
1637     }
1638   }
1639 
1640   if (has_receiver) {
1641     // Make sure the receiver is loaded into a register.
1642     assert(method->size_of_parameters() > 0, "oob");
1643     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1644     VMReg r = regs[0].first();
1645     assert(r->is_valid(), "bad receiver arg");
1646     if (r->is_stack()) {
1647       // Porting note:  This assumes that compiled calling conventions always
1648       // pass the receiver oop in a register.  If this is not true on some
1649       // platform, pick a temp and load the receiver from stack.
1650       fatal("receiver always in a register");
1651       receiver_reg = j_rarg0;  // known to be free at this point
1652       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1653     } else {
1654       // no data motion is needed
1655       receiver_reg = r->as_Register();
1656     }
1657   }
1658 
1659   // Figure out which address we are really jumping to:
1660   MethodHandles::generate_method_handle_dispatch(masm, iid,
1661                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1662 }
1663 
1664 // ---------------------------------------------------------------------------
1665 // Generate a native wrapper for a given method.  The method takes arguments
1666 // in the Java compiled code convention, marshals them to the native
1667 // convention (handlizes oops, etc), transitions to native, makes the call,
1668 // returns to java state (possibly blocking), unhandlizes any result and
1669 // returns.
1670 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI handle
// block and the check for pending exceptions, because it's impossible for them
// to be thrown.
1679 //
1680 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1681                                                 const methodHandle& method,
1682                                                 int compile_id,
1683                                                 BasicType* in_sig_bt,
1684                                                 VMRegPair* in_regs,
1685                                                 BasicType ret_type) {
1686   if (method->is_continuation_native_intrinsic()) {
1687     int exception_offset = -1;
1688     OopMapSet* oop_maps = new OopMapSet();
1689     int frame_complete = -1;
1690     int stack_slots = -1;
1691     int interpreted_entry_offset = -1;
1692     int vep_offset = -1;
1693     if (method->is_continuation_enter_intrinsic()) {
1694       gen_continuation_enter(masm,
1695                              in_regs,
1696                              exception_offset,
1697                              oop_maps,
1698                              frame_complete,
1699                              stack_slots,
1700                              interpreted_entry_offset,
1701                              vep_offset);
1702     } else if (method->is_continuation_yield_intrinsic()) {
1703       gen_continuation_yield(masm,
1704                              in_regs,
1705                              oop_maps,
1706                              frame_complete,
1707                              stack_slots,
1708                              vep_offset);
1709     } else {
1710       guarantee(false, "Unknown Continuation native intrinsic");
1711     }
1712 
1713 #ifdef ASSERT
1714     if (method->is_continuation_enter_intrinsic()) {
1715       assert(interpreted_entry_offset != -1, "Must be set");
1716       assert(exception_offset != -1,         "Must be set");
1717     } else {
1718       assert(interpreted_entry_offset == -1, "Must be unset");
1719       assert(exception_offset == -1,         "Must be unset");
1720     }
1721     assert(frame_complete != -1,    "Must be set");
1722     assert(stack_slots != -1,       "Must be set");
1723     assert(vep_offset != -1,        "Must be set");
1724 #endif
1725 
1726     __ flush();
1727     nmethod* nm = nmethod::new_native_nmethod(method,
1728                                               compile_id,
1729                                               masm->code(),
1730                                               vep_offset,
1731                                               frame_complete,
1732                                               stack_slots,
1733                                               in_ByteSize(-1),
1734                                               in_ByteSize(-1),
1735                                               oop_maps,
1736                                               exception_offset);
1737     if (nm == nullptr) return nm;
1738     if (method->is_continuation_enter_intrinsic()) {
1739       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1740     } else if (method->is_continuation_yield_intrinsic()) {
1741       _cont_doYield_stub = nm;
1742     }
1743     return nm;
1744   }
1745 
1746   if (method->is_method_handle_intrinsic()) {
1747     vmIntrinsics::ID iid = method->intrinsic_id();
1748     intptr_t start = (intptr_t)__ pc();
1749     int vep_offset = ((intptr_t)__ pc()) - start;
1750     gen_special_dispatch(masm,
1751                          method,
1752                          in_sig_bt,
1753                          in_regs);
1754     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1755     __ flush();
1756     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1757     return nmethod::new_native_nmethod(method,
1758                                        compile_id,
1759                                        masm->code(),
1760                                        vep_offset,
1761                                        frame_complete,
1762                                        stack_slots / VMRegImpl::slots_per_word,
1763                                        in_ByteSize(-1),
1764                                        in_ByteSize(-1),
1765                                        nullptr);
1766   }
1767   address native_func = method->native_function();
1768   assert(native_func != nullptr, "must have function");
1769 
1770   // An OopMap for lock (and class if static)
1771   OopMapSet *oop_maps = new OopMapSet();
1772   intptr_t start = (intptr_t)__ pc();
1773 
  // We have received a description of where all the Java args are located
  // on entry to the wrapper. We need to convert these args to where
  // the JNI function will expect them. To figure out where they go
  // we convert the Java signature to a C signature by inserting
  // the hidden arguments as arg[0] and possibly arg[1] (static method).
1779 
1780   const int total_in_args = method->size_of_parameters();
1781   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1782 
1783   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1784   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1785   BasicType* in_elem_bt = nullptr;
1786 
1787   int argc = 0;
1788   out_sig_bt[argc++] = T_ADDRESS;
1789   if (method->is_static()) {
1790     out_sig_bt[argc++] = T_OBJECT;
1791   }
1792 
1793   for (int i = 0; i < total_in_args ; i++ ) {
1794     out_sig_bt[argc++] = in_sig_bt[i];
1795   }
1796 
1797   // Now figure out where the args must be stored and how much stack space
1798   // they require.
1799   int out_arg_slots;
1800   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1801 
1802   // Compute framesize for the wrapper.  We need to handlize all oops in
1803   // incoming registers
1804 
1805   // Calculate the total number of stack slots we will need.
1806 
1807   // First count the abi requirement plus all of the outgoing args
1808   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1809 
1810   // Now the space for the inbound oop handle area
1811   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1812 
1813   int oop_handle_offset = stack_slots;
1814   stack_slots += total_save_slots;
1815 
1816   // Now any space we need for handlizing a klass if static method
1817 
1818   int klass_slot_offset = 0;
1819   int klass_offset = -1;
1820   int lock_slot_offset = 0;
1821   bool is_static = false;
1822 
1823   if (method->is_static()) {
1824     klass_slot_offset = stack_slots;
1825     stack_slots += VMRegImpl::slots_per_word;
1826     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1827     is_static = true;
1828   }
1829 
1830   // Plus a lock if needed
1831 
1832   if (method->is_synchronized()) {
1833     lock_slot_offset = stack_slots;
1834     stack_slots += VMRegImpl::slots_per_word;
1835   }
1836 
1837   // Now a place (+2) to save return values or temp during shuffling
1838   // + 4 for return address (which we own) and saved rbp
1839   stack_slots += 6;
1840 
1841   // Ok The space we have allocated will look like:
1842   //
1843   //
1844   // FP-> |                     |
1845   //      |---------------------|
1846   //      | 2 slots for moves   |
1847   //      |---------------------|
1848   //      | lock box (if sync)  |
1849   //      |---------------------| <- lock_slot_offset
1850   //      | klass (if static)   |
1851   //      |---------------------| <- klass_slot_offset
1852   //      | oopHandle area      |
1853   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1854   //      | outbound memory     |
1855   //      | based arguments     |
1856   //      |                     |
1857   //      |---------------------|
1858   //      |                     |
1859   // SP-> | out_preserved_slots |
1860   //
1861   //
1862 
1863 
1864   // Now compute actual number of stack words we need rounding to make
1865   // stack properly aligned.
1866   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1867 
1868   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
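
  // Illustrative sizing example (assuming 8-byte words, 4-byte stack slots and
  // 16-byte stack alignment): for a non-static, non-synchronized method with
  // out_arg_slots == 8 we get
  //   stack_slots = 0 (out_preserve) + 8 (outgoing args) + 12 (oop handle area) + 6 = 26,
  // which aligns up to 28 slots, i.e. stack_size == 112 bytes.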
1869 
1870   // First thing make an ic check to see if we should even be here
1871 
1872   // We are free to use all registers as temps without saving them and
1873   // restoring them except rbp. rbp is the only callee save register
1874   // as far as the interpreter and the compiler(s) are concerned.
1875 
1876   const Register receiver = j_rarg0;
1877 
1878   Label exception_pending;
1879 
1880   assert_different_registers(receiver, rscratch1, rscratch2);
1881   __ verify_oop(receiver);
1882   __ ic_check(8 /* end_alignment */);
1883 
1884   int vep_offset = ((intptr_t)__ pc()) - start;
1885 
1886   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1887     Label L_skip_barrier;
1888     Register klass = r10;
1889     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1890     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1891 
1892     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1893 
1894     __ bind(L_skip_barrier);
1895   }
1896 
1897 #ifdef COMPILER1
1898   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1899   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1900     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1901   }
1902 #endif // COMPILER1
1903 
1904   // The instruction at the verified entry point must be 5 bytes or longer
1905   // because it can be patched on the fly by make_non_entrant. The stack bang
1906   // instruction fits that requirement.
1907 
1908   // Generate stack overflow check
1909   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1910 
1911   // Generate a new frame for the wrapper.
1912   __ enter();
1913   // -2 because return address is already present and so is saved rbp
1914   __ subptr(rsp, stack_size - 2*wordSize);
1915 
1916   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1917   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1918   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1919 
1920   // Frame is now completed as far as size and linkage.
1921   int frame_complete = ((intptr_t)__ pc()) - start;
1922 
  if (UseRTMLocking) {
    // Abort RTM transaction before calling JNI
    // because the critical section will be large and will be
    // aborted anyway. Also, the nmethod could be deoptimized.
    __ xabort(0);
  }
1929 
1930 #ifdef ASSERT
1931   __ check_stack_alignment(rsp, "improperly aligned stack");
1932 #endif /* ASSERT */
1933 
1934 
1935   // We use r14 as the oop handle for the receiver/klass
1936   // It is callee save so it survives the call to native
1937 
1938   const Register oop_handle_reg = r14;
1939 
1940   //
  // We immediately shuffle the arguments so that for any VM call we have to
  // make from here on out (sync slow path, jvmti, etc.) we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.
1945 
1946   // -----------------
1947   // The Grand Shuffle
1948 
  // The Java calling convention is either equal (linux) or denser (win64) than the
  // C calling convention. However, because of the jni_env argument the C calling
  // convention always has at least one more (and two for static) arguments than Java.
  // Therefore if we move the args from Java -> C backwards then we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.
1955   //
1956 
1957   // Record esp-based slot for receiver on stack for non-static methods
1958   int receiver_offset = -1;
1959 
  // This is a trick. We double the stack slots so we can claim
  // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
  // sure we can capture all the incoming oop args from the
  // caller.
1965   //
1966   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1967 
1968   // Mark location of rbp (someday)
1969   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1970 
1971   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1972   // All inbound args are referenced based on rbp and all outbound args via rsp.
1973 
1974 
1975 #ifdef ASSERT
1976   bool reg_destroyed[Register::number_of_registers];
1977   bool freg_destroyed[XMMRegister::number_of_registers];
1978   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
1979     reg_destroyed[r] = false;
1980   }
1981   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
1982     freg_destroyed[f] = false;
1983   }
1984 
1985 #endif /* ASSERT */
1986 
1987   // For JNI natives the incoming and outgoing registers are offset upwards.
1988   GrowableArray<int> arg_order(2 * total_in_args);
1989 
1990   VMRegPair tmp_vmreg;
1991   tmp_vmreg.set2(rbx->as_VMReg());
1992 
1993   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1994     arg_order.push(i);
1995     arg_order.push(c_arg);
1996   }
1997 
1998   int temploc = -1;
1999   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2000     int i = arg_order.at(ai);
2001     int c_arg = arg_order.at(ai + 1);
2002     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2003 #ifdef ASSERT
2004     if (in_regs[i].first()->is_Register()) {
2005       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2006     } else if (in_regs[i].first()->is_XMMRegister()) {
2007       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2008     }
2009     if (out_regs[c_arg].first()->is_Register()) {
2010       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2011     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2012       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2013     }
2014 #endif /* ASSERT */
2015     switch (in_sig_bt[i]) {
2016       case T_ARRAY:
2017       case T_OBJECT:
2018         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2019                     ((i == 0) && (!is_static)),
2020                     &receiver_offset);
2021         break;
2022       case T_VOID:
2023         break;
2024 
      case T_FLOAT:
        __ float_move(in_regs[i], out_regs[c_arg]);
        break;
2028 
2029       case T_DOUBLE:
2030         assert( i + 1 < total_in_args &&
2031                 in_sig_bt[i + 1] == T_VOID &&
2032                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2033         __ double_move(in_regs[i], out_regs[c_arg]);
2034         break;
2035 
2036       case T_LONG :
2037         __ long_move(in_regs[i], out_regs[c_arg]);
2038         break;
2039 
2040       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
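        // Falls through to the default move below; the assert only fires in debug builds.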
2041 
2042       default:
2043         __ move32_64(in_regs[i], out_regs[c_arg]);
2044     }
2045   }
2046 
2047   int c_arg;
2048 
2049   // Pre-load a static method's oop into r14.  Used both by locking code and
2050   // the normal JNI call code.
2051   // point c_arg at the first arg that is already loaded in case we
2052   // need to spill before we call out
2053   c_arg = total_c_args - total_in_args;
2054 
2055   if (method->is_static()) {
2056 
2057     //  load oop into a register
2058     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2059 
    // Now handlize the static class mirror; it's known not-null.
2061     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2062     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2063 
2064     // Now get the handle
2065     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2066     // store the klass handle as second argument
2067     __ movptr(c_rarg1, oop_handle_reg);
2068     // and protect the arg if we must spill
2069     c_arg--;
2070   }
2071 
2072   // Change state to native (we save the return address in the thread, since it might not
2073   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2074   // points into the right code segment. It does not have to be the correct return pc.
2075   // We use the same pc/oopMap repeatedly when we call out
2076 
2077   intptr_t the_pc = (intptr_t) __ pc();
2078   oop_maps->add_gc_map(the_pc - start, map);
2079 
2080   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2081 
2082 
  // We have all of the arguments set up at this point. We must not clobber any of the
  // argument registers from here on. If we need to save/restore them around a VM call
  // (see save_args/restore_args below), that is safe because they hold no raw oops:
  // oop arguments have already been handlized.
2085 
2086   {
2087     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2088     // protect the args we've loaded
2089     save_args(masm, total_c_args, c_arg, out_regs);
2090     __ mov_metadata(c_rarg1, method());
2091     __ call_VM_leaf(
2092       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2093       r15_thread, c_rarg1);
2094     restore_args(masm, total_c_args, c_arg, out_regs);
2095   }
2096 
2097   // RedefineClasses() tracing support for obsolete method entry
2098   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2099     // protect the args we've loaded
2100     save_args(masm, total_c_args, c_arg, out_regs);
2101     __ mov_metadata(c_rarg1, method());
2102     __ call_VM_leaf(
2103       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2104       r15_thread, c_rarg1);
2105     restore_args(masm, total_c_args, c_arg, out_regs);
2106   }
2107 
2108   // Lock a synchronized method
2109 
2110   // Register definitions used by locking and unlocking
2111 
2112   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2113   const Register obj_reg  = rbx;  // Will contain the oop
2114   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2115   const Register old_hdr  = r13;  // value of old header at unlock time
2116 
2117   Label slow_path_lock;
2118   Label lock_done;
2119 
2120   if (method->is_synchronized()) {
2121     Label count_mon;
2122 
2123     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2124 
2125     // Get the handle (the 2nd argument)
2126     __ mov(oop_handle_reg, c_rarg1);
2127 
2128     // Get address of the box
2129 
2130     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2131 
2132     // Load the oop from the handle
2133     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2134 
2135     if (LockingMode == LM_MONITOR) {
2136       __ jmp(slow_path_lock);
2137     } else if (LockingMode == LM_LEGACY) {
2138       // Load immediate 1 into swap_reg %rax
2139       __ movl(swap_reg, 1);
2140 
2141       // Load (object->mark() | 1) into swap_reg %rax
2142       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2143 
2144       // Save (object->mark() | 1) into BasicLock's displaced header
2145       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2146 
2147       // src -> dest iff dest == rax else rax <- dest
2148       __ lock();
2149       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2150       __ jcc(Assembler::equal, count_mon);
2151 
2152       // Hmm should this move to the slow path code area???
2153 
2154       // Test if the oopMark is an obvious stack pointer, i.e.,
2155       //  1) (mark & 3) == 0, and
      //  2) rsp <= mark < rsp + os::vm_page_size()
2157       // These 3 tests can be done by evaluating the following
2158       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2159       // assuming both stack pointer and pagesize have their
2160       // least significant 2 bits clear.
2161       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2162 
2163       __ subptr(swap_reg, rsp);
2164       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2165 
2166       // Save the test result, for recursive case, the result is zero
2167       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2168       __ jcc(Assembler::notEqual, slow_path_lock);
2169     } else {
2170       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2171       __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2172     }
2173     __ bind(count_mon);
2174     __ inc_held_monitor_count();
2175 
2176     // Slow path will re-enter here
2177     __ bind(lock_done);
2178   }
2179 
2180   // Finally just about ready to make the JNI call
2181 
2182   // get JNIEnv* which is first argument to native
2183   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2184 
2185   // Now set thread in native
2186   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2187 
2188   __ call(RuntimeAddress(native_func));
2189 
2190   // Verify or restore cpu control state after JNI call
2191   __ restore_cpu_control_state_after_jni(rscratch1);
2192 
2193   // Unpack native results.
2194   switch (ret_type) {
2195   case T_BOOLEAN: __ c2bool(rax);            break;
2196   case T_CHAR   : __ movzwl(rax, rax);      break;
2197   case T_BYTE   : __ sign_extend_byte (rax); break;
2198   case T_SHORT  : __ sign_extend_short(rax); break;
2199   case T_INT    : /* nothing to do */        break;
2200   case T_DOUBLE :
2201   case T_FLOAT  :
2202     // Result is in xmm0 we'll save as needed
2203     break;
2204   case T_ARRAY:                 // Really a handle
2205   case T_OBJECT:                // Really a handle
2206       break; // can't de-handlize until after safepoint check
2207   case T_VOID: break;
2208   case T_LONG: break;
2209   default       : ShouldNotReachHere();
2210   }
2211 
2212   Label after_transition;
2213 
2214   // Switch thread to "native transition" state before reading the synchronization state.
2215   // This additional state is necessary because reading and testing the synchronization
2216   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2217   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2218   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2219   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
2221   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2222 
2223   // Force this write out before the read below
2224   if (!UseSystemMemoryBarrier) {
2225     __ membar(Assembler::Membar_mask_bits(
2226               Assembler::LoadLoad | Assembler::LoadStore |
2227               Assembler::StoreLoad | Assembler::StoreStore));
2228   }
2229 
2230   // check for safepoint operation in progress and/or pending suspend requests
2231   {
2232     Label Continue;
2233     Label slow_path;
2234 
2235     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2236 
2237     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2238     __ jcc(Assembler::equal, Continue);
2239     __ bind(slow_path);
2240 
    // Don't use call_VM, as it will see a possible pending exception and forward it,
    // never returning here and preventing us from clearing _last_native_pc down below.
    // We can't use call_VM_leaf either, as it will check whether rsi & rdi are
    // preserved and correspond to the bcp/locals pointers. So we do the runtime call
    // by hand.
2246     //
2247     __ vzeroupper();
2248     save_native_result(masm, ret_type, stack_slots);
2249     __ mov(c_rarg0, r15_thread);
2250     __ mov(r12, rsp); // remember sp
2251     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2252     __ andptr(rsp, -16); // align stack as required by ABI
2253     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2254     __ mov(rsp, r12); // restore sp
2255     __ reinit_heapbase();
2256     // Restore any method result value
2257     restore_native_result(masm, ret_type, stack_slots);
2258     __ bind(Continue);
2259   }
2260 
2261   // change thread state
2262   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2263   __ bind(after_transition);
2264 
2265   Label reguard;
2266   Label reguard_done;
2267   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2268   __ jcc(Assembler::equal, reguard);
2269   __ bind(reguard_done);
2270 
2271   // native result if any is live
2272 
2273   // Unlock
2274   Label slow_path_unlock;
2275   Label unlock_done;
2276   if (method->is_synchronized()) {
2277 
2278     Label fast_done;
2279 
2280     // Get locked oop from the handle we passed to jni
2281     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2282 
2283     if (LockingMode == LM_LEGACY) {
2284       Label not_recur;
2285       // Simple recursive lock?
2286       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2287       __ jcc(Assembler::notEqual, not_recur);
2288       __ dec_held_monitor_count();
2289       __ jmpb(fast_done);
2290       __ bind(not_recur);
2291     }
2292 
2293     // Must save rax if it is live now because cmpxchg must use it
2294     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2295       save_native_result(masm, ret_type, stack_slots);
2296     }
2297 
2298     if (LockingMode == LM_MONITOR) {
2299       __ jmp(slow_path_unlock);
2300     } else if (LockingMode == LM_LEGACY) {
2301       // get address of the stack lock
2302       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2303       //  get old displaced header
2304       __ movptr(old_hdr, Address(rax, 0));
2305 
2306       // Atomic swap old header if oop still contains the stack lock
2307       __ lock();
2308       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2309       __ jcc(Assembler::notEqual, slow_path_unlock);
2310       __ dec_held_monitor_count();
2311     } else {
2312       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2313       __ lightweight_unlock(obj_reg, swap_reg, r15_thread, lock_reg, slow_path_unlock);
2314       __ dec_held_monitor_count();
2315     }
2316 
2317     // slow path re-enters here
2318     __ bind(unlock_done);
2319     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2320       restore_native_result(masm, ret_type, stack_slots);
2321     }
2322 
2323     __ bind(fast_done);
2324   }
2325   {
2326     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2327     save_native_result(masm, ret_type, stack_slots);
2328     __ mov_metadata(c_rarg1, method());
2329     __ call_VM_leaf(
2330          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2331          r15_thread, c_rarg1);
2332     restore_native_result(masm, ret_type, stack_slots);
2333   }
2334 
2335   __ reset_last_Java_frame(false);
2336 
2337   // Unbox oop result, e.g. JNIHandles::resolve value.
2338   if (is_reference_type(ret_type)) {
2339     __ resolve_jobject(rax /* value */,
2340                        r15_thread /* thread */,
2341                        rcx /* tmp */);
2342   }
2343 
2344   if (CheckJNICalls) {
2345     // clear_pending_jni_exception_check
2346     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2347   }
2348 
2349   // reset handle block
2350   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2351   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2352 
2353   // pop our frame
2354 
2355   __ leave();
2356 
2357   // Any exception pending?
2358   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2359   __ jcc(Assembler::notEqual, exception_pending);
2360 
2361   // Return
2362 
2363   __ ret(0);
2364 
2365   // Unexpected paths are out of line and go here
2366 
  __ bind(exception_pending);

  // forward the exception
2371   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2372 
2373   // Slow path locking & unlocking
2374   if (method->is_synchronized()) {
2375 
2376     // BEGIN Slow path lock
2377     __ bind(slow_path_lock);
2378 
    // has last_Java_frame setup. No exceptions, so do a vanilla call, not call_VM
2380     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2381 
2382     // protect the args we've loaded
2383     save_args(masm, total_c_args, c_arg, out_regs);
2384 
2385     __ mov(c_rarg0, obj_reg);
2386     __ mov(c_rarg1, lock_reg);
2387     __ mov(c_rarg2, r15_thread);
2388 
2389     // Not a leaf but we have last_Java_frame setup as we want
2390     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2391     restore_args(masm, total_c_args, c_arg, out_regs);
2392 
2393 #ifdef ASSERT
2394     { Label L;
2395     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2396     __ jcc(Assembler::equal, L);
2397     __ stop("no pending exception allowed on exit from monitorenter");
2398     __ bind(L);
2399     }
2400 #endif
2401     __ jmp(lock_done);
2402 
2403     // END Slow path lock
2404 
2405     // BEGIN Slow path unlock
2406     __ bind(slow_path_unlock);
2407 
2408     // If we haven't already saved the native result we must save it now as xmm registers
2409     // are still exposed.
2410     __ vzeroupper();
2411     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2412       save_native_result(masm, ret_type, stack_slots);
2413     }
2414 
2415     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2416 
2417     __ mov(c_rarg0, obj_reg);
2418     __ mov(c_rarg2, r15_thread);
2419     __ mov(r12, rsp); // remember sp
2420     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2421     __ andptr(rsp, -16); // align stack as required by ABI
2422 
2423     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2424     // NOTE that obj_reg == rbx currently
2425     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2426     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2427 
2428     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2429     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2430     __ mov(rsp, r12); // restore sp
2431     __ reinit_heapbase();
2432 #ifdef ASSERT
2433     {
2434       Label L;
2435       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2436       __ jcc(Assembler::equal, L);
2437       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2438       __ bind(L);
2439     }
2440 #endif /* ASSERT */
2441 
2442     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2443 
2444     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2445       restore_native_result(masm, ret_type, stack_slots);
2446     }
2447     __ jmp(unlock_done);
2448 
2449     // END Slow path unlock
2450 
2451   } // synchronized
2452 
2453   // SLOW PATH Reguard the stack if needed
2454 
2455   __ bind(reguard);
2456   __ vzeroupper();
2457   save_native_result(masm, ret_type, stack_slots);
2458   __ mov(r12, rsp); // remember sp
2459   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2460   __ andptr(rsp, -16); // align stack as required by ABI
2461   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2462   __ mov(rsp, r12); // restore sp
2463   __ reinit_heapbase();
2464   restore_native_result(masm, ret_type, stack_slots);
2465   // and continue
2466   __ jmp(reguard_done);
2467 
2468 
2469 
2470   __ flush();
2471 
2472   nmethod *nm = nmethod::new_native_nmethod(method,
2473                                             compile_id,
2474                                             masm->code(),
2475                                             vep_offset,
2476                                             frame_complete,
2477                                             stack_slots / VMRegImpl::slots_per_word,
2478                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2479                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2480                                             oop_maps);
2481 
2482   return nm;
2483 }
2484 
// This function returns the adjustment (in number of words) to a c2i adapter
// activation, for use during deoptimization.
2487 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2488   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2489 }
2490 
2491 
2492 uint SharedRuntime::out_preserve_stack_slots() {
2493   return 0;
2494 }
2495 
2496 
2497 // Number of stack slots between incoming argument block and the start of
2498 // a new frame.  The PROLOG must add this many slots to the stack.  The
// EPILOG must remove this many slots.  amd64 needs two slots for the
// return address and two for the saved rbp.
2501 uint SharedRuntime::in_preserve_stack_slots() {
2502   return 4 + 2 * VerifyStackAtCalls;
2503 }
2504 
2505 //------------------------------generate_deopt_blob----------------------------
2506 void SharedRuntime::generate_deopt_blob() {
2507   // Allocate space for the code
2508   ResourceMark rm;
2509   // Setup code generation tools
2510   int pad = 0;
2511   if (UseAVX > 2) {
2512     pad += 1024;
2513   }
2514 #if INCLUDE_JVMCI
2515   if (EnableJVMCI) {
2516     pad += 512; // Increase the buffer size when compiling for JVMCI
2517   }
2518 #endif
2519   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2520   MacroAssembler* masm = new MacroAssembler(&buffer);
2521   int frame_size_in_words;
2522   OopMap* map = nullptr;
2523   OopMapSet *oop_maps = new OopMapSet();
2524 
2525   // -------------
2526   // This code enters when returning to a de-optimized nmethod.  A return
2527   // address has been pushed on the stack, and return values are in
2528   // registers.
2529   // If we are doing a normal deopt then we were called from the patched
2530   // nmethod from the point we returned to the nmethod. So the return
2531   // address on the stack is wrong by NativeCall::instruction_size
2532   // We will adjust the value so it looks like we have the original return
2533   // address on the stack (like when we eagerly deoptimized).
2534   // In the case of an exception pending when deoptimizing, we enter
2535   // with a return address on the stack that points after the call we patched
2536   // into the exception handler. We have the following register state from,
2537   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2538   //    rax: exception oop
2539   //    rbx: exception handler
2540   //    rdx: throwing pc
2541   // So in this case we simply jam rdx into the useless return address and
2542   // the stack looks just like we want.
2543   //
2544   // At this point we need to de-opt.  We save the argument return
2545   // registers.  We call the first C routine, fetch_unroll_info().  This
2546   // routine captures the return values and returns a structure which
2547   // describes the current frame size and the sizes of all replacement frames.
2548   // The current frame is compiled code and may contain many inlined
2549   // functions, each with their own JVM state.  We pop the current frame, then
2550   // push all the new frames.  Then we call the C routine unpack_frames() to
2551   // populate these frames.  Finally unpack_frames() returns us the new target
2552   // address.  Notice that callee-save registers are BLOWN here; they have
2553   // already been captured in the vframeArray at the time the return PC was
2554   // patched.
2555   address start = __ pc();
2556   Label cont;
2557 
  // Prolog for the non-exception case!
2559 
2560   // Save everything in sight.
2561   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2562 
2563   // Normal deoptimization.  Save exec mode for unpack_frames.
2564   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2565   __ jmp(cont);
2566 
2567   int reexecute_offset = __ pc() - start;
2568 #if INCLUDE_JVMCI && !defined(COMPILER1)
2569   if (EnableJVMCI && UseJVMCICompiler) {
2570     // JVMCI does not use this kind of deoptimization
2571     __ should_not_reach_here();
2572   }
2573 #endif
2574 
2575   // Reexecute case
  // The return address is the pc that describes which bci to re-execute at.
2577 
2578   // No need to update map as each call to save_live_registers will produce identical oopmap
2579   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2580 
2581   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2582   __ jmp(cont);
2583 
2584 #if INCLUDE_JVMCI
2585   Label after_fetch_unroll_info_call;
2586   int implicit_exception_uncommon_trap_offset = 0;
2587   int uncommon_trap_offset = 0;
2588 
2589   if (EnableJVMCI) {
2590     implicit_exception_uncommon_trap_offset = __ pc() - start;
2591 
2592     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2593     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2594 
2595     uncommon_trap_offset = __ pc() - start;
2596 
2597     // Save everything in sight.
2598     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2599     // fetch_unroll_info needs to call last_java_frame()
2600     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2601 
2602     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2603     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2604 
2605     __ movl(r14, Deoptimization::Unpack_reexecute);
2606     __ mov(c_rarg0, r15_thread);
2607     __ movl(c_rarg2, r14); // exec mode
2608     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2609     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2610 
2611     __ reset_last_Java_frame(false);
2612 
2613     __ jmp(after_fetch_unroll_info_call);
2614   } // EnableJVMCI
2615 #endif // INCLUDE_JVMCI
2616 
2617   int exception_offset = __ pc() - start;
2618 
2619   // Prolog for exception case
2620 
  // All registers are dead at this entry point, except for rax and
  // rdx, which contain the exception oop and exception pc
  // respectively.  Set them in TLS and fall thru to the
  // unpack_with_exception_in_tls entry point.
2625 
2626   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2627   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2628 
2629   int exception_in_tls_offset = __ pc() - start;
2630 
2631   // new implementation because exception oop is now passed in JavaThread
2632 
2633   // Prolog for exception case
2634   // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2636   // tos: stack at point of call to method that threw the exception (i.e. only
2637   // args are on the stack, no return address)
2638 
2639   // make room on stack for the return address
2640   // It will be patched later with the throwing pc. The correct value is not
2641   // available now because loading it from memory would destroy registers.
2642   __ push(0);
2643 
2644   // Save everything in sight.
2645   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2646 
2647   // Now it is safe to overwrite any register
2648 
2649   // Deopt during an exception.  Save exec mode for unpack_frames.
2650   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2651 
2652   // load throwing pc from JavaThread and patch it as the return address
2653   // of the current frame. Then clear the field in JavaThread
2654 
2655   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2656   __ movptr(Address(rbp, wordSize), rdx);
2657   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2658 
2659 #ifdef ASSERT
2660   // verify that there is really an exception oop in JavaThread
2661   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2662   __ verify_oop(rax);
2663 
2664   // verify that there is no pending exception
2665   Label no_pending_exception;
2666   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2667   __ testptr(rax, rax);
2668   __ jcc(Assembler::zero, no_pending_exception);
2669   __ stop("must not have pending exception here");
2670   __ bind(no_pending_exception);
2671 #endif
2672 
2673   __ bind(cont);
2674 
2675   // Call C code.  Need thread and this frame, but NOT official VM entry
2676   // crud.  We cannot block on this call, no GC can happen.
2677   //
2678   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2679 
2680   // fetch_unroll_info needs to call last_java_frame().
2681 
2682   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2683 #ifdef ASSERT
2684   { Label L;
2685     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2686     __ jcc(Assembler::equal, L);
2687     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2688     __ bind(L);
2689   }
2690 #endif // ASSERT
2691   __ mov(c_rarg0, r15_thread);
2692   __ movl(c_rarg1, r14); // exec_mode
2693   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2694 
2695   // Need to have an oopmap that tells fetch_unroll_info where to
2696   // find any register it might need.
2697   oop_maps->add_gc_map(__ pc() - start, map);
2698 
2699   __ reset_last_Java_frame(false);
2700 
2701 #if INCLUDE_JVMCI
2702   if (EnableJVMCI) {
2703     __ bind(after_fetch_unroll_info_call);
2704   }
2705 #endif
2706 
2707   // Load UnrollBlock* into rdi
2708   __ mov(rdi, rax);
2709 
2710   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2711   Label noException;
2712   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2713   __ jcc(Assembler::notEqual, noException);
2714   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2715   // QQQ this is useless; exception_pc was cleared to null above
2716   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2717   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2718   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2719 
2720   __ verify_oop(rax);
2721 
2722   // Overwrite the result registers with the exception results.
2723   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2724   // I think this is useless
2725   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2726 
2727   __ bind(noException);
2728 
2729   // Only register save data is on the stack.
2730   // Now restore the result registers.  Everything else is either dead
2731   // or captured in the vframeArray.
2732   RegisterSaver::restore_result_registers(masm);
2733 
2734   // All of the register save area has been popped off the stack. Only the
2735   // return address remains.
2736 
2737   // Pop all the frames we must move/replace.
2738   //
2739   // Frame picture (youngest to oldest)
2740   // 1: self-frame (no frame link)
2741   // 2: deopting frame  (no frame link)
2742   // 3: caller of deopting frame (could be compiled/interpreted).
2743   //
2744   // Note: by leaving the return address of the self-frame on the stack
2745   // and using the size of frame 2 to adjust the stack,
2746   // the return address to frame 3 will still be on the stack when we are done.
2747 
2748   // Pop deoptimized frame
2749   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2750   __ addptr(rsp, rcx);
2751 
2752   // rsp should be pointing at the return address to the caller (3)
2753 
2754   // Pick up the initial fp we should save
2755   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2756   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2757 
2758 #ifdef ASSERT
2759   // Compilers generate code that bangs the stack by as much as the
2760   // interpreter would need. So this stack banging should never
2761   // trigger a fault. Verify that it does not on non-product builds.
2762   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2763   __ bang_stack_size(rbx, rcx);
2764 #endif
2765 
2766   // Load address of array of frame pcs into rcx
2767   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2768 
2769   // Trash the old pc
2770   __ addptr(rsp, wordSize);
2771 
2772   // Load address of array of frame sizes into rsi
2773   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2774 
2775   // Load counter into rdx
2776   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2777 
2778   // Now adjust the caller's stack to make up for the extra locals
2779   // but record the original sp so that we can save it in the skeletal interpreter
2780   // frame and the stack walking of interpreter_sender will get the unextended sp
2781   // value and not the "real" sp value.
2782 
2783   const Register sender_sp = r8;
2784 
2785   __ mov(sender_sp, rsp);
2786   __ movl(rbx, Address(rdi,
2787                        Deoptimization::UnrollBlock::
2788                        caller_adjustment_offset()));
2789   __ subptr(rsp, rbx);
2790 
2791   // Push interpreter frames in a loop
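       // Each iteration builds one skeletal interpreter frame:
       //   [ return pc            ]  <- taken from the pcs array
       //   [ saved rbp            ]  <- pushed by enter()
       //   [ frame_size - 2 words ]  <- body, filled in later by unpack_frames()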
2792   Label loop;
2793   __ bind(loop);
2794   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2795   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2796   __ pushptr(Address(rcx, 0));          // Save return address
2797   __ enter();                           // Save old & set new ebp
2798   __ subptr(rsp, rbx);                  // Prolog
2799   // This value is corrected by layout_activation_impl
2800   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2801   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2802   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2803   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2804   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2805   __ decrementl(rdx);                   // Decrement counter
2806   __ jcc(Assembler::notZero, loop);
2807   __ pushptr(Address(rcx, 0));          // Save final return address
2808 
2809   // Re-push self-frame
2810   __ enter();                           // Save old & set new ebp
2811 
2812   // Allocate a full sized register save area.
2813   // Return address and rbp are in place, so we allocate two fewer words.
2814   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2815 
2816   // Restore frame locals after moving the frame
2817   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2818   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2819 
2820   // Call C code.  Need thread but NOT official VM entry
2821   // crud.  We cannot block on this call, no GC can happen.  Call should
2822   // restore return values to their stack-slots with the new SP.
2823   //
2824   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2825 
2826   // Use rbp because the frames look interpreted now
2827   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2828   // Don't need the precise return PC here, just precise enough to point into this code blob.
2829   address the_pc = __ pc();
2830   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2831 
2832   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2833   __ mov(c_rarg0, r15_thread);
2834   __ movl(c_rarg1, r14); // second arg: exec_mode
2835   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2836   // Revert SP alignment after call since we're going to do some SP relative addressing below
2837   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2838 
2839   // Set an oopmap for the call site
2840   // Use the same PC we used for the last java frame
2841   oop_maps->add_gc_map(the_pc - start,
2842                        new OopMap( frame_size_in_words, 0 ));
2843 
2844   // Clear fp AND pc
2845   __ reset_last_Java_frame(true);
2846 
2847   // Collect return values
2848   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2849   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2850   // I think this is useless (throwing pc?)
2851   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2852 
2853   // Pop self-frame.
2854   __ leave();                           // Epilog
2855 
2856   // Jump to interpreter
2857   __ ret(0);
2858 
2859   // Make sure all code is generated
2860   masm->flush();
2861 
2862   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2863   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2864 #if INCLUDE_JVMCI
2865   if (EnableJVMCI) {
2866     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2867     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2868   }
2869 #endif
2870 }
2871 
2872 #ifdef COMPILER2
2873 //------------------------------generate_uncommon_trap_blob--------------------
2874 void SharedRuntime::generate_uncommon_trap_blob() {
2875   // Allocate space for the code
2876   ResourceMark rm;
2877   // Setup code generation tools
2878   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2879   MacroAssembler* masm = new MacroAssembler(&buffer);
2880 
2881   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2882 
2883   address start = __ pc();
2884 
2885   if (UseRTMLocking) {
2886     // Abort RTM transaction before possible nmethod deoptimization.
2887     __ xabort(0);
2888   }
2889 
2890   // Push self-frame.  We get here with a return address on the
2891   // stack, so rsp is 8-byte aligned until we allocate our frame.
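       // return_off jint slots below the return address already on the stack
       // complete a frame of SimpleRuntimeFrame::framesize slots in total.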
2892   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
2893 
2894   // No callee saved registers. rbp is assumed implicitly saved
2895   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2896 
2897   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2898   // runtime expects it.
2899   __ movl(c_rarg1, j_rarg0);
2900 
2901   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2902 
2903   // Call C code.  Need thread but NOT official VM entry
2904   // crud.  We cannot block on this call, no GC can happen.  Call should
2905   // capture callee-saved registers as well as return values.
2906   // Thread is in rdi already.
2907   //
2908   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
2909 
2910   __ mov(c_rarg0, r15_thread);
2911   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2912   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2913 
2914   // Set an oopmap for the call site
2915   OopMapSet* oop_maps = new OopMapSet();
2916   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2917 
2918   // location of rbp is known implicitly by the frame sender code
2919 
2920   oop_maps->add_gc_map(__ pc() - start, map);
2921 
2922   __ reset_last_Java_frame(false);
2923 
2924   // Load UnrollBlock* into rdi
2925   __ mov(rdi, rax);
2926 
2927 #ifdef ASSERT
2928   { Label L;
2929     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
2930               Deoptimization::Unpack_uncommon_trap);
2931     __ jcc(Assembler::equal, L);
2932     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2933     __ bind(L);
2934   }
2935 #endif
2936 
2937   // Pop all the frames we must move/replace.
2938   //
2939   // Frame picture (youngest to oldest)
2940   // 1: self-frame (no frame link)
2941   // 2: deopting frame  (no frame link)
2942   // 3: caller of deopting frame (could be compiled/interpreted).
2943 
2944   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2945   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2946 
2947   // Pop deoptimized frame (int)
2948   __ movl(rcx, Address(rdi,
2949                        Deoptimization::UnrollBlock::
2950                        size_of_deoptimized_frame_offset()));
2951   __ addptr(rsp, rcx);
2952 
2953   // rsp should be pointing at the return address to the caller (3)
2954 
2955   // Pick up the initial fp we should save
2956   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2957   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2958 
2959 #ifdef ASSERT
2960   // Compilers generate code that bangs the stack by as much as the
2961   // interpreter would need. So this stack banging should never
2962   // trigger a fault. Verify that it does not on non-product builds.
2963   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2964   __ bang_stack_size(rbx, rcx);
2965 #endif
2966 
2967   // Load address of array of frame pcs into rcx (address*)
2968   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2969 
2970   // Trash the return pc
2971   __ addptr(rsp, wordSize);
2972 
2973   // Load address of array of frame sizes into rsi (intptr_t*)
2974   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2975 
2976   // Counter
2977   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
2978 
2979   // Now adjust the caller's stack to make up for the extra locals but
2980   // record the original sp so that we can save it in the skeletal
2981   // interpreter frame and the stack walking of interpreter_sender
2982   // will get the unextended sp value and not the "real" sp value.
2983 
2984   const Register sender_sp = r8;
2985 
2986   __ mov(sender_sp, rsp);
2987   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
2988   __ subptr(rsp, rbx);
2989 
2990   // Push interpreter frames in a loop
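       // Each iteration builds the same skeletal interpreter frame shape as in
       // generate_deopt_blob() above.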
2991   Label loop;
2992   __ bind(loop);
2993   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2994   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2995   __ pushptr(Address(rcx, 0));     // Save return address
2996   __ enter();                      // Save old & set new rbp
2997   __ subptr(rsp, rbx);             // Prolog
2998   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2999             sender_sp);            // Make it walkable
3000   // This value is corrected by layout_activation_impl
3001   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3002   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3003   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3004   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3005   __ decrementl(rdx);              // Decrement counter
3006   __ jcc(Assembler::notZero, loop);
3007   __ pushptr(Address(rcx, 0));     // Save final return address
3008 
3009   // Re-push self-frame
3010   __ enter();                 // Save old & set new rbp
3011   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3012                               // Prolog
3013 
3014   // Use rbp because the frames look interpreted now
3015   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3016   // Don't need the precise return PC here, just precise enough to point into this code blob.
3017   address the_pc = __ pc();
3018   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3019 
3020   // Call C code.  Need thread but NOT official VM entry
3021   // crud.  We cannot block on this call, no GC can happen.  Call should
3022   // restore return values to their stack-slots with the new SP.
3023   // Thread is in rdi already.
3024   //
3025   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3026 
3027   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3028   __ mov(c_rarg0, r15_thread);
3029   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3030   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3031 
3032   // Set an oopmap for the call site
3033   // Use the same PC we used for the last java frame
3034   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3035 
3036   // Clear fp AND pc
3037   __ reset_last_Java_frame(true);
3038 
3039   // Pop self-frame.
3040   __ leave();                 // Epilog
3041 
3042   // Jump to interpreter
3043   __ ret(0);
3044 
3045   // Make sure all code is generated
3046   masm->flush();
3047 
3048   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3049                                                  SimpleRuntimeFrame::framesize >> 1);
3050 }
3051 #endif // COMPILER2
3052 
3053 //------------------------------generate_handler_blob------
3054 //
3055 // Generate a special Compile2Runtime blob that saves all registers,
3056 // and sets up an oopmap.
3057 //
3058 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3059   assert(StubRoutines::forward_exception_entry() != nullptr,
3060          "must be generated before");
3061 
3062   ResourceMark rm;
3063   OopMapSet *oop_maps = new OopMapSet();
3064   OopMap* map;
3065 
3066   // Allocate space for the code.  Setup code generation tools.
3067   CodeBuffer buffer("handler_blob", 2048, 1024);
3068   MacroAssembler* masm = new MacroAssembler(&buffer);
3069 
3070   address start   = __ pc();
3071   address call_pc = nullptr;
3072   int frame_size_in_words;
3073   bool cause_return = (poll_type == POLL_AT_RETURN);
3074   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3075 
3076   if (UseRTMLocking) {
3077     // Abort RTM transaction before calling runtime
3078     // because critical section will be large and will be
3079     // aborted anyway. Also nmethod could be deoptimized.
3080     __ xabort(0);
3081   }
3082 
3083   // Make room for return address (or push it again)
3084   if (!cause_return) {
3085     __ push(rbx);
3086   }
3087 
3088   // Save registers, fpu state, and flags
3089   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3090 
3091   // The following is basically a call_VM.  However, we need the precise
3092   // address of the call in order to generate an oopmap. Hence, we do all the
3093   // work ourselves.
3094 
3095   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3096 
3097   // The return address must always be correct so that frame constructor never
3098   // sees an invalid pc.
3099 
3100   if (!cause_return) {
3101     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3102     // Additionally, rbx is a callee saved register and we can look at it later to determine
3103     // if someone changed the return address for us!
3104     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3105     __ movptr(Address(rbp, wordSize), rbx);
3106   }
3107 
3108   // Do the call
3109   __ mov(c_rarg0, r15_thread);
3110   __ call(RuntimeAddress(call_ptr));
3111 
3112   // Set an oopmap for the call site.  This oopmap will map all
3113   // oop-registers and debug-info registers as callee-saved.  This
3114   // will allow deoptimization at this safepoint to find all possible
3115   // debug-info recordings, as well as let GC find all oops.
3116 
3117   oop_maps->add_gc_map( __ pc() - start, map);
3118 
3119   Label noException;
3120 
3121   __ reset_last_Java_frame(false);
3122 
3123   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3124   __ jcc(Assembler::equal, noException);
3125 
3126   // Exception pending
3127 
3128   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3129 
3130   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3131 
3132   // No exception case
3133   __ bind(noException);
3134 
3135   Label no_adjust;
3136 #ifdef ASSERT
3137   Label bail;
3138 #endif
3139   if (!cause_return) {
3140     Label no_prefix, not_special;
3141 
3142     // If our stashed return pc was modified by the runtime, we avoid touching it
3143     __ cmpptr(rbx, Address(rbp, wordSize));
3144     __ jccb(Assembler::notEqual, no_adjust);
3145 
3146     // Skip over the poll instruction.
3147     // See NativeInstruction::is_safepoint_poll()
3148     // Possible encodings:
3149     //      85 00       test   %eax,(%rax)
3150     //      85 01       test   %eax,(%rcx)
3151     //      85 02       test   %eax,(%rdx)
3152     //      85 03       test   %eax,(%rbx)
3153     //      85 06       test   %eax,(%rsi)
3154     //      85 07       test   %eax,(%rdi)
3155     //
3156     //   41 85 00       test   %eax,(%r8)
3157     //   41 85 01       test   %eax,(%r9)
3158     //   41 85 02       test   %eax,(%r10)
3159     //   41 85 03       test   %eax,(%r11)
3160     //   41 85 06       test   %eax,(%r14)
3161     //   41 85 07       test   %eax,(%r15)
3162     //
3163     //      85 04 24    test   %eax,(%rsp)
3164     //   41 85 04 24    test   %eax,(%r12)
3165     //      85 45 00    test   %eax,0x0(%rbp)
3166     //   41 85 45 00    test   %eax,0x0(%r13)
3167 
3168     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3169     __ jcc(Assembler::notEqual, no_prefix);
3170     __ addptr(rbx, 1);
3171     __ bind(no_prefix);
3172 #ifdef ASSERT
3173     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3174 #endif
3175     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3176     // r12/rsp 0x04
3177     // r13/rbp 0x05
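         // The low three bits of the modrm byte are the r/m field; r/m == 4 means
         // a SIB byte follows and r/m == 5 means a disp8 follows, so those polls
         // are one byte longer and we skip one extra byte.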
3178     __ movzbq(rcx, Address(rbx, 1));
3179     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3180     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3181     __ cmpptr(rcx, 1);
3182     __ jcc(Assembler::above, not_special);
3183     __ addptr(rbx, 1);
3184     __ bind(not_special);
3185 #ifdef ASSERT
3186     // Verify the correct encoding of the poll we're about to skip.
3187     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3188     __ jcc(Assembler::notEqual, bail);
3189     // Mask out the modrm bits
3190     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3191     // rax encodes to 0, so if the bits are nonzero it's incorrect
3192     __ jcc(Assembler::notZero, bail);
3193 #endif
3194     // Adjust return pc forward to step over the safepoint poll instruction
3195     __ addptr(rbx, 2);
3196     __ movptr(Address(rbp, wordSize), rbx);
3197   }
3198 
3199   __ bind(no_adjust);
3200   // Normal exit, restore registers and exit.
3201   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3202   __ ret(0);
3203 
3204 #ifdef ASSERT
3205   __ bind(bail);
3206   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3207 #endif
3208 
3209   // Make sure all code is generated
3210   masm->flush();
3211 
3212   // Fill-out other meta info
3213   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3214 }
3215 
3216 //
3217 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3218 //
3219 // Generate a stub that calls into the VM to find out the proper destination
3220 // of a Java call. All the argument registers are live at this point,
3221 // but since this is generic code we don't know what they are and the caller
3222 // must do any GC of the args.
3223 //
3224 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3225   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3226 
3227   // allocate space for the code
3228   ResourceMark rm;
3229 
3230   CodeBuffer buffer(name, 1200, 512);
3231   MacroAssembler* masm = new MacroAssembler(&buffer);
3232 
3233   int frame_size_in_words;
3234 
3235   OopMapSet *oop_maps = new OopMapSet();
3236   OopMap* map = nullptr;
3237 
3238   int start = __ offset();
3239 
3240   // No need to save vector registers since they are caller-saved anyway.
3241   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3242 
3243   int frame_complete = __ offset();
3244 
3245   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3246 
3247   __ mov(c_rarg0, r15_thread);
3248 
3249   __ call(RuntimeAddress(destination));
3250 
3251 
3252   // Set an oopmap for the call site.
3253   // We need this not only for callee-saved registers, but also for volatile
3254   // registers that the compiler might be keeping live across a safepoint.
3255 
3256   oop_maps->add_gc_map( __ offset() - start, map);
3257 
3258   // rax contains the address we are going to jump to, assuming no exception got installed
3259 
3260   // clear last_Java_sp
3261   __ reset_last_Java_frame(false);
3262   // check for pending exceptions
3263   Label pending;
3264   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3265   __ jcc(Assembler::notEqual, pending);
3266 
3267   // get the returned Method*
3268   __ get_vm_result_2(rbx, r15_thread);
3269   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3270 
3271   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
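       // Both values are written into the register-save area so that
       // restore_live_registers() below reloads them: rbx then holds the resolved
       // callee Method* (where the call setup expects it) and rax holds the entry
       // point used by the jmp below.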
3272 
3273   RegisterSaver::restore_live_registers(masm);
3274 
3275   // We are back to the original state on entry and ready to go.
3276 
3277   __ jmp(rax);
3278 
3279   // Pending exception after the safepoint
3280 
3281   __ bind(pending);
3282 
3283   RegisterSaver::restore_live_registers(masm);
3284 
3285   // exception pending => remove activation and forward to exception handler
3286 
3287   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3288 
3289   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3290   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3291 
3292   // -------------
3293   // make sure all code is generated
3294   masm->flush();
3295 
3296   // return the blob
3297   // the frame size passed to new_runtime_stub() is in words
3298   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3299 }
3300 
3301 //------------------------------Montgomery multiplication------------------------
3302 //
3303 
3304 #ifndef _WINDOWS
3305 
3306 // Subtract 0:b from carry:a.  Return carry.
3307 static julong
3308 sub(julong a[], julong b[], julong carry, long len) {
3309   long long i = 0, cnt = len;
3310   julong tmp;
3311   asm volatile("clc; "
3312                "0: ; "
3313                "mov (%[b], %[i], 8), %[tmp]; "
3314                "sbb %[tmp], (%[a], %[i], 8); "
3315                "inc %[i]; dec %[cnt]; "
3316                "jne 0b; "
3317                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3318                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3319                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3320                : "memory");
3321   return tmp;
3322 }
3323 
3324 // Multiply (unsigned) Long A by Long B, accumulating the double-
3325 // length result into the accumulator formed of T0, T1, and T2.
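     // i.e. (T2:T1:T0) += A * B, using the 128-bit product left in rdx:rax by mul.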
3326 #define MACC(A, B, T0, T1, T2)                                  \
3327 do {                                                            \
3328   unsigned long hi, lo;                                         \
3329   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3330            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3331            : "r"(A), "a"(B) : "cc");                            \
3332  } while(0)
3333 
3334 // As above, but add twice the double-length result into the
3335 // accumulator.
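     // i.e. (T2:T1:T0) += 2 * A * B.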
3336 #define MACC2(A, B, T0, T1, T2)                                 \
3337 do {                                                            \
3338   unsigned long hi, lo;                                         \
3339   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3340            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3341            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3342            : "r"(A), "a"(B) : "cc");                            \
3343  } while(0)
3344 
3345 #else //_WINDOWS
3346 
3347 static julong
3348 sub(julong a[], julong b[], julong carry, long len) {
3349   long i;
3350   julong tmp;
3351   unsigned char c = 1;
3352   for (i = 0; i < len; i++) {
3353     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3354     a[i] = tmp;
3355   }
3356   c = _addcarry_u64(c, carry, ~0, &tmp);
3357   return tmp;
3358 }
3359 
3360 // Multiply (unsigned) Long A by Long B, accumulating the double-
3361 // length result into the accumulator formed of T0, T1, and T2.
3362 #define MACC(A, B, T0, T1, T2)                          \
3363 do {                                                    \
3364   julong hi, lo;                                        \
3365   lo = _umul128(A, B, &hi);                             \
3366   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3367   c = _addcarry_u64(c, hi, T1, &T1);                    \
3368   _addcarry_u64(c, T2, 0, &T2);                         \
3369  } while(0)
3370 
3371 // As above, but add twice the double-length result into the
3372 // accumulator.
3373 #define MACC2(A, B, T0, T1, T2)                         \
3374 do {                                                    \
3375   julong hi, lo;                                        \
3376   lo = _umul128(A, B, &hi);                             \
3377   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3378   c = _addcarry_u64(c, hi, T1, &T1);                    \
3379   _addcarry_u64(c, T2, 0, &T2);                         \
3380   c = _addcarry_u64(0, lo, T0, &T0);                    \
3381   c = _addcarry_u64(c, hi, T1, &T1);                    \
3382   _addcarry_u64(c, T2, 0, &T2);                         \
3383  } while(0)
3384 
3385 #endif //_WINDOWS
3386 
3387 // Fast Montgomery multiplication.  The derivation of the algorithm is
3388 // in  A Cryptographic Library for the Motorola DSP56000,
3389 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
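     //
     // The operands are little-endian arrays of len 64-bit words, inv is
     // -n[0]^-1 mod 2^64 (checked by the assert below), and the result left in
     // m[0..len-1] is a * b * 2^(-64*len) mod n.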
3390 
3391 static void NOINLINE
3392 montgomery_multiply(julong a[], julong b[], julong n[],
3393                     julong m[], julong inv, int len) {
3394   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3395   int i;
3396 
3397   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3398 
3399   for (i = 0; i < len; i++) {
3400     int j;
3401     for (j = 0; j < i; j++) {
3402       MACC(a[j], b[i-j], t0, t1, t2);
3403       MACC(m[j], n[i-j], t0, t1, t2);
3404     }
3405     MACC(a[i], b[0], t0, t1, t2);
3406     m[i] = t0 * inv;
3407     MACC(m[i], n[0], t0, t1, t2);
3408 
3409     assert(t0 == 0, "broken Montgomery multiply");
3410 
3411     t0 = t1; t1 = t2; t2 = 0;
3412   }
3413 
3414   for (i = len; i < 2*len; i++) {
3415     int j;
3416     for (j = i-len+1; j < len; j++) {
3417       MACC(a[j], b[i-j], t0, t1, t2);
3418       MACC(m[j], n[i-j], t0, t1, t2);
3419     }
3420     m[i-len] = t0;
3421     t0 = t1; t1 = t2; t2 = 0;
3422   }
3423 
3424   while (t0)
3425     t0 = sub(m, n, t0, len);
3426 }
3427 
3428 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3429 // multiplies so it should be up to 25% faster than Montgomery
3430 // multiplication.  However, its loop control is more complex and it
3431 // may actually run slower on some machines.
3432 
3433 static void NOINLINE
3434 montgomery_square(julong a[], julong n[],
3435                   julong m[], julong inv, int len) {
3436   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3437   int i;
3438 
3439   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3440 
3441   for (i = 0; i < len; i++) {
3442     int j;
3443     int end = (i+1)/2;
3444     for (j = 0; j < end; j++) {
3445       MACC2(a[j], a[i-j], t0, t1, t2);
3446       MACC(m[j], n[i-j], t0, t1, t2);
3447     }
3448     if ((i & 1) == 0) {
3449       MACC(a[j], a[j], t0, t1, t2);
3450     }
3451     for (; j < i; j++) {
3452       MACC(m[j], n[i-j], t0, t1, t2);
3453     }
3454     m[i] = t0 * inv;
3455     MACC(m[i], n[0], t0, t1, t2);
3456 
3457     assert(t0 == 0, "broken Montgomery square");
3458 
3459     t0 = t1; t1 = t2; t2 = 0;
3460   }
3461 
3462   for (i = len; i < 2*len; i++) {
3463     int start = i-len+1;
3464     int end = start + (len - start)/2;
3465     int j;
3466     for (j = start; j < end; j++) {
3467       MACC2(a[j], a[i-j], t0, t1, t2);
3468       MACC(m[j], n[i-j], t0, t1, t2);
3469     }
3470     if ((i & 1) == 0) {
3471       MACC(a[j], a[j], t0, t1, t2);
3472     }
3473     for (; j < len; j++) {
3474       MACC(m[j], n[i-j], t0, t1, t2);
3475     }
3476     m[i-len] = t0;
3477     t0 = t1; t1 = t2; t2 = 0;
3478   }
3479 
3480   while (t0)
3481     t0 = sub(m, n, t0, len);
3482 }
3483 
3484 // Swap words in a longword.
3485 static julong swap(julong x) {
3486   return (x << 32) | (x >> 32);
3487 }
3488 
3489 // Copy len longwords from s to d, word-swapping as we go.  The
3490 // destination array is reversed.
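     // This converts between the big-endian jint layout of BigInteger magnitude
     // arrays and the little-endian julong layout expected by the Montgomery
     // routines above.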
3491 static void reverse_words(julong *s, julong *d, int len) {
3492   d += len;
3493   while(len-- > 0) {
3494     d--;
3495     *d = swap(*s);
3496     s++;
3497   }
3498 }
3499 
3500 // The threshold at which squaring is advantageous was determined
3501 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3502 #define MONTGOMERY_SQUARING_THRESHOLD 64
3503 
3504 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3505                                         jint len, jlong inv,
3506                                         jint *m_ints) {
3507   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3508   int longwords = len/2;
3509 
3510   // Make very sure we don't use so much space that the stack might
3511   // overflow.  512 jints corresponds to a 16384-bit integer and
3512   // will use a total of 8K bytes of stack space here.
3513   int divisor = sizeof(julong) * 4;
3514   guarantee(longwords <= 8192 / divisor, "must be");
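       // With divisor == 32 this caps longwords at 256 (512 jints), so
       // total_allocation below is at most 8K bytes.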
3515   int total_allocation = longwords * sizeof (julong) * 4;
3516   julong *scratch = (julong *)alloca(total_allocation);
3517 
3518   // Local scratch arrays
3519   julong
3520     *a = scratch + 0 * longwords,
3521     *b = scratch + 1 * longwords,
3522     *n = scratch + 2 * longwords,
3523     *m = scratch + 3 * longwords;
3524 
3525   reverse_words((julong *)a_ints, a, longwords);
3526   reverse_words((julong *)b_ints, b, longwords);
3527   reverse_words((julong *)n_ints, n, longwords);
3528 
3529   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3530 
3531   reverse_words(m, (julong *)m_ints, longwords);
3532 }
3533 
3534 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3535                                       jint len, jlong inv,
3536                                       jint *m_ints) {
3537   assert(len % 2 == 0, "array length in montgomery_square must be even");
3538   int longwords = len/2;
3539 
3540   // Make very sure we don't use so much space that the stack might
3541   // overflow.  512 jints corresponds to a 16384-bit integer and
3542   // will use a total of 6K bytes of stack space here.
3543   int divisor = sizeof(julong) * 3;
3544   guarantee(longwords <= (8192 / divisor), "must be");
3545   int total_allocation = longwords * sizeof (julong) * 3;
3546   julong *scratch = (julong *)alloca(total_allocation);
3547 
3548   // Local scratch arrays
3549   julong
3550     *a = scratch + 0 * longwords,
3551     *n = scratch + 1 * longwords,
3552     *m = scratch + 2 * longwords;
3553 
3554   reverse_words((julong *)a_ints, a, longwords);
3555   reverse_words((julong *)n_ints, n, longwords);
3556 
3557   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3558     ::montgomery_square(a, n, m, (julong)inv, longwords);
3559   } else {
3560     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3561   }
3562 
3563   reverse_words(m, (julong *)m_ints, longwords);
3564 }
3565 
3566 #ifdef COMPILER2
3567 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3568 //
3569 //------------------------------generate_exception_blob---------------------------
3570 // Creates the exception blob at the end.
3571 // Compiled methods jump to this code when an exception is thrown
3572 // (see emit_exception_handler in the x86_64.ad file).
3573 //
3574 // Given an exception pc at a call we call into the runtime for the
3575 // handler in this method. This handler might merely restore state
3576 // (i.e. callee-save registers), unwind the frame, and jump to the
3577 // exception handler for the nmethod if there is no Java level handler
3578 // for the nmethod.
3579 //
3580 // This code is entered with a jmp.
3581 //
3582 // Arguments:
3583 //   rax: exception oop
3584 //   rdx: exception pc
3585 //
3586 // Results:
3587 //   rax: exception oop
3588 //   rdx: exception pc in caller or ???
3589 //   destination: exception handler of caller
3590 //
3591 // Note: the exception pc MUST be at a call (precise debug information)
3592 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3593 //
3594 
3595 void OptoRuntime::generate_exception_blob() {
3596   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3597   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3598   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3599 
3600   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3601 
3602   // Allocate space for the code
3603   ResourceMark rm;
3604   // Setup code generation tools
3605   CodeBuffer buffer("exception_blob", 2048, 1024);
3606   MacroAssembler* masm = new MacroAssembler(&buffer);
3607 
3608 
3609   address start = __ pc();
3610 
3611   // Exception pc is 'return address' for stack walker
3612   __ push(rdx);
3613   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
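       // The pushed exception pc occupies the return-address slots, so together
       // with this subtraction the frame is SimpleRuntimeFrame::framesize slots.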
3614 
3615   // Save callee-saved registers.  See x86_64.ad.
3616 
3617   // rbp is an implicitly saved callee-saved register (i.e., the calling
3618   // convention will save/restore it in the prolog/epilog). Other than that
3619   // there are no callee-saved registers now that adapter frames are gone.
3620 
3621   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3622 
3623   // Store exception in Thread object. We cannot pass any arguments to the
3624   // handle_exception call, since we do not want to make any assumption
3625   // about the size of the frame where the exception happened in.
3626   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3627   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3628   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3629 
3630   // This call does all the hard work.  It checks if an exception handler
3631   // exists in the method.
3632   // If so, it returns the handler address.
3633   // If not, it prepares for stack-unwinding, restoring the callee-save
3634   // registers of the frame being removed.
3635   //
3636   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3637 
3638   // At a method handle call, the stack may not be properly aligned
3639   // when returning with an exception.
3640   address the_pc = __ pc();
3641   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3642   __ mov(c_rarg0, r15_thread);
3643   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3644   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3645 
3646   // Set an oopmap for the call site.  This oopmap will only be used if we
3647   // are unwinding the stack.  Hence, all locations will be dead.
3648   // Callee-saved registers will be the same as the frame above (i.e.,
3649   // handle_exception_stub), since they were restored when we got the
3650   // exception.
3651 
3652   OopMapSet* oop_maps = new OopMapSet();
3653 
3654   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3655 
3656   __ reset_last_Java_frame(false);
3657 
3658   // Restore callee-saved registers
3659 
3660   // rbp is an implicitly saved callee-saved register (i.e., the calling
3661   // convention will save/restore it in the prolog/epilog). Other than that
3662   // there are no callee-saved registers now that adapter frames are gone.
3663 
3664   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3665 
3666   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3667   __ pop(rdx);                  // No need for exception pc anymore
3668 
3669   // rax: exception handler
3670 
3671   // We have a handler in rax (could be deopt blob).
3672   __ mov(r8, rax);
3673 
3674   // Get the exception oop
3675   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3676   // Get the exception pc in case we are deoptimized
3677   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3678 #ifdef ASSERT
3679   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3680   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3681 #endif
3682   // Clear the exception oop so GC no longer processes it as a root.
3683   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3684 
3685   // rax: exception oop
3686   // r8:  exception handler
3687   // rdx: exception pc
3688   // Jump to handler
3689 
3690   __ jmp(r8);
3691 
3692   // Make sure all code is generated
3693   masm->flush();
3694 
3695   // Set exception blob
3696   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3697 }
3698 #endif // COMPILER2
3699