1 /*
   2  * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "classfile/symbolTable.hpp"
  32 #include "code/compiledIC.hpp"
  33 #include "code/debugInfoRec.hpp"
  34 #include "code/icBuffer.hpp"
  35 #include "code/nativeInst.hpp"
  36 #include "code/vtableStubs.hpp"
  37 #include "compiler/oopMap.hpp"
  38 #include "gc/shared/collectedHeap.hpp"
  39 #include "gc/shared/gcLocker.hpp"
  40 #include "gc/shared/barrierSet.hpp"
  41 #include "gc/shared/barrierSetAssembler.hpp"
  42 #include "interpreter/interpreter.hpp"
  43 #include "logging/log.hpp"
  44 #include "memory/resourceArea.hpp"
  45 #include "memory/universe.hpp"
  46 #include "oops/compiledICHolder.hpp"
  47 #include "oops/klass.inline.hpp"
  48 #include "oops/method.inline.hpp"
  49 #include "prims/methodHandles.hpp"
  50 #include "runtime/continuation.hpp"
  51 #include "runtime/continuationEntry.inline.hpp"
  52 #include "runtime/globals.hpp"
  53 #include "runtime/jniHandles.hpp"
  54 #include "runtime/safepointMechanism.hpp"
  55 #include "runtime/sharedRuntime.hpp"
  56 #include "runtime/signature.hpp"
  57 #include "runtime/stubRoutines.hpp"
  58 #include "runtime/vframeArray.hpp"
  59 #include "runtime/vm_version.hpp"
  60 #include "utilities/align.hpp"
  61 #include "utilities/checkedCast.hpp"
  62 #include "utilities/formatBuffer.hpp"
  63 #include "vmreg_x86.inline.hpp"
  64 #ifdef COMPILER1
  65 #include "c1/c1_Runtime1.hpp"
  66 #endif
  67 #ifdef COMPILER2
  68 #include "opto/runtime.hpp"
  69 #endif
  70 #if INCLUDE_JVMCI
  71 #include "jvmci/jvmciJavaClasses.hpp"
  72 #endif
  73 
  74 #define __ masm->
  75 
  76 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  77 
  78 class SimpleRuntimeFrame {
  79 
  80   public:
  81 
  82   // Most of the runtime stubs have this simple frame layout.
  83   // This class exists to make the layout shared in one place.
  84   // Offsets are for compiler stack slots, which are jints.
  85   enum layout {
  86     // The frame sender code expects that rbp will be in the "natural" place and
  87     // will override any oopMap setting for it. We must therefore force the layout
  88     // so that it agrees with the frame sender code.
  89     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  90     rbp_off2,
  91     return_off, return_off2,
  92     framesize
  93   };
  94 };
  95 
  96 class RegisterSaver {
  97   // Capture info about frame layout.  Layout offsets are in jint
  98   // units because compiler frame slots are jints.
  99 #define XSAVE_AREA_BEGIN 160
 100 #define XSAVE_AREA_YMM_BEGIN 576
 101 #define XSAVE_AREA_OPMASK_BEGIN 1088
 102 #define XSAVE_AREA_ZMM_BEGIN 1152
 103 #define XSAVE_AREA_UPPERBANK 1664
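// These are byte offsets into the save area written by fxsave/xsave from
// push_CPU_state: xmm0..xmm15 live in the legacy FXSAVE image starting at
// offset 160, and the extended components (upper YMM halves, opmask registers,
// upper ZMM halves, zmm16..zmm31) follow at the standard non-compacted XSAVE
// offsets used below.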
 104 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 105 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 106 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 107 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 108 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 109   enum layout {
 110     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 111     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 112     DEF_XMM_OFFS(0),
 113     DEF_XMM_OFFS(1),
 114     // 2..15 are implied in range usage
 115     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 116     DEF_YMM_OFFS(0),
 117     DEF_YMM_OFFS(1),
 118     // 2..15 are implied in range usage
 119     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 120     DEF_OPMASK_OFFS(0),
 121     DEF_OPMASK_OFFS(1),
 122     // 2..7 are implied in range usage
 123     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 124     DEF_ZMM_OFFS(0),
 125     DEF_ZMM_OFFS(1),
 126     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 127     DEF_ZMM_UPPER_OFFS(16),
 128     DEF_ZMM_UPPER_OFFS(17),
 129     // 18..31 are implied in range usage
 130     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 131     fpu_stateH_end,
 132     r15_off, r15H_off,
 133     r14_off, r14H_off,
 134     r13_off, r13H_off,
 135     r12_off, r12H_off,
 136     r11_off, r11H_off,
 137     r10_off, r10H_off,
 138     r9_off,  r9H_off,
 139     r8_off,  r8H_off,
 140     rdi_off, rdiH_off,
 141     rsi_off, rsiH_off,
 142     ignore_off, ignoreH_off,  // extra copy of rbp
 143     rsp_off, rspH_off,
 144     rbx_off, rbxH_off,
 145     rdx_off, rdxH_off,
 146     rcx_off, rcxH_off,
 147     rax_off, raxH_off,
 148     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 149     align_off, alignH_off,
 150     flags_off, flagsH_off,
 151     // The frame sender code expects that rbp will be in the "natural" place and
 152     // will override any oopMap setting for it. We must therefore force the layout
 153     // so that it agrees with the frame sender code.
 154     rbp_off, rbpH_off,        // copy of rbp we will restore
 155     return_off, returnH_off,  // slot for return address
 156     reg_save_size             // size in compiler stack slots
 157   };
 158 
 159  public:
 160   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 161   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 162 
 163   // Offsets into the register save area
 164   // Used by deoptimization when it is managing result register
 165   // values on its own
 166 
 167   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 168   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 169   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 170   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 171   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 172 
 173   // During deoptimization only the result registers need to be restored,
 174   // all the other values have already been extracted.
 175   static void restore_result_registers(MacroAssembler* masm);
 176 };
 177 
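// Save the complete CPU state (general purpose registers, flags, and the
// FPU/vector state via push_CPU_state) into a frame laid out as described by
// the enum above, and return an OopMap recording where each register was
// saved so GC and deoptimization can locate oops and debug values in it.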
 178 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 179   int off = 0;
 180   int num_xmm_regs = XMMRegister::available_xmm_registers();
 181 #if COMPILER2_OR_JVMCI
 182   if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 184   }
 185   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 186 #else
 187   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 188 #endif
 189 
  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated this way.
 191   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 192   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 193   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 194   // CodeBlob frame size is in words.
 195   int frame_size_in_words = frame_size_in_bytes / wordSize;
 196   *total_frame_words = frame_size_in_words;
 197 
 198   // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, like a normal enter().
 203 
 204   __ enter();          // rsp becomes 16-byte aligned here
 205   __ push_CPU_state(); // Push a multiple of 16 bytes
 206 
  // push_CPU_state handles this on EVEX-enabled targets
 208   if (save_wide_vectors) {
 209     // Save upper half of YMM registers(0..15)
 210     int base_addr = XSAVE_AREA_YMM_BEGIN;
 211     for (int n = 0; n < 16; n++) {
 212       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 213     }
 214     if (VM_Version::supports_evex()) {
 215       // Save upper half of ZMM registers(0..15)
 216       base_addr = XSAVE_AREA_ZMM_BEGIN;
 217       for (int n = 0; n < 16; n++) {
 218         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 219       }
 220       // Save full ZMM registers(16..num_xmm_regs)
 221       base_addr = XSAVE_AREA_UPPERBANK;
 222       off = 0;
 223       int vector_len = Assembler::AVX_512bit;
 224       for (int n = 16; n < num_xmm_regs; n++) {
 225         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 226       }
 227 #if COMPILER2_OR_JVMCI
 228       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 229       off = 0;
 230       for(int n = 0; n < KRegister::number_of_registers; n++) {
 231         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 232       }
 233 #endif
 234     }
 235   } else {
 236     if (VM_Version::supports_evex()) {
 237       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 238       int base_addr = XSAVE_AREA_UPPERBANK;
 239       off = 0;
 240       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 241       for (int n = 16; n < num_xmm_regs; n++) {
 242         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 243       }
 244 #if COMPILER2_OR_JVMCI
 245       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 246       off = 0;
 247       for(int n = 0; n < KRegister::number_of_registers; n++) {
 248         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 249       }
 250 #endif
 251     }
 252   }
 253   __ vzeroupper();
 254   if (frame::arg_reg_save_area_bytes != 0) {
 255     // Allocate argument register save area
 256     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 257   }
 258 
 259   // Set an oopmap for the call site.  This oopmap will map all
 260   // oop-registers and debug-info registers as callee-saved.  This
 261   // will allow deoptimization at this safepoint to find all possible
 262   // debug-info recordings, as well as let GC find all oops.
 263 
 264   OopMapSet *oop_maps = new OopMapSet();
 265   OopMap* map = new OopMap(frame_size_in_slots, 0);
 266 
 267 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 268 
 269   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 271   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 272   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 273   // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
 275   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 283   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 284   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX-enabled targets it is included in the xsave area
 287   off = xmm0_off;
 288   int delta = xmm1_off - off;
 289   for (int n = 0; n < 16; n++) {
 290     XMMRegister xmm_name = as_XMMRegister(n);
 291     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 292     off += delta;
 293   }
 294   if (UseAVX > 2) {
 295     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 296     off = zmm16_off;
 297     delta = zmm17_off - off;
 298     for (int n = 16; n < num_xmm_regs; n++) {
 299       XMMRegister zmm_name = as_XMMRegister(n);
 300       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 301       off += delta;
 302     }
 303   }
 304 
 305 #if COMPILER2_OR_JVMCI
 306   if (save_wide_vectors) {
 307     // Save upper half of YMM registers(0..15)
 308     off = ymm0_off;
 309     delta = ymm1_off - ymm0_off;
 310     for (int n = 0; n < 16; n++) {
 311       XMMRegister ymm_name = as_XMMRegister(n);
 312       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 313       off += delta;
 314     }
 315     if (VM_Version::supports_evex()) {
 316       // Save upper half of ZMM registers(0..15)
 317       off = zmm0_off;
 318       delta = zmm1_off - zmm0_off;
 319       for (int n = 0; n < 16; n++) {
 320         XMMRegister zmm_name = as_XMMRegister(n);
 321         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 322         off += delta;
 323       }
 324     }
 325   }
 326 #endif // COMPILER2_OR_JVMCI
 327 
 328   // %%% These should all be a waste but we'll keep things as they were for now
 329   if (true) {
 330     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 333     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 334     // rbp location is known implicitly by the frame sender code, needs no oopmap
 335     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 343     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 344     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX-enabled targets it is included in the xsave area
 347     off = xmm0H_off;
 348     delta = xmm1H_off - off;
 349     for (int n = 0; n < 16; n++) {
 350       XMMRegister xmm_name = as_XMMRegister(n);
 351       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 352       off += delta;
 353     }
 354     if (UseAVX > 2) {
 355       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 356       off = zmm16H_off;
 357       delta = zmm17H_off - off;
 358       for (int n = 16; n < num_xmm_regs; n++) {
 359         XMMRegister zmm_name = as_XMMRegister(n);
 360         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 361         off += delta;
 362       }
 363     }
 364   }
 365 
 366   return map;
 367 }
 368 
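// Undo save_live_registers(): drop the argument register save area, reload any
// vector/opmask state that was stored beside the xsave image above, then pop
// the CPU state and the saved rbp.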
 369 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 370   int num_xmm_regs = XMMRegister::available_xmm_registers();
 371   if (frame::arg_reg_save_area_bytes != 0) {
 372     // Pop arg register save area
 373     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 374   }
 375 
 376 #if COMPILER2_OR_JVMCI
 377   if (restore_wide_vectors) {
 378     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 379     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 380   }
 381 #else
 382   assert(!restore_wide_vectors, "vectors are generated only by C2");
 383 #endif
 384 
 385   __ vzeroupper();
 386 
  // On EVEX-enabled targets everything is handled when popping the FPU state
 388   if (restore_wide_vectors) {
 389     // Restore upper half of YMM registers (0..15)
 390     int base_addr = XSAVE_AREA_YMM_BEGIN;
 391     for (int n = 0; n < 16; n++) {
 392       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 393     }
 394     if (VM_Version::supports_evex()) {
 395       // Restore upper half of ZMM registers (0..15)
 396       base_addr = XSAVE_AREA_ZMM_BEGIN;
 397       for (int n = 0; n < 16; n++) {
 398         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 399       }
 400       // Restore full ZMM registers(16..num_xmm_regs)
 401       base_addr = XSAVE_AREA_UPPERBANK;
 402       int vector_len = Assembler::AVX_512bit;
 403       int off = 0;
 404       for (int n = 16; n < num_xmm_regs; n++) {
 405         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 406       }
 407 #if COMPILER2_OR_JVMCI
 408       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 409       off = 0;
 410       for (int n = 0; n < KRegister::number_of_registers; n++) {
 411         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 412       }
 413 #endif
 414     }
 415   } else {
 416     if (VM_Version::supports_evex()) {
 417       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 418       int base_addr = XSAVE_AREA_UPPERBANK;
 419       int off = 0;
 420       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 421       for (int n = 16; n < num_xmm_regs; n++) {
 422         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 423       }
 424 #if COMPILER2_OR_JVMCI
 425       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 426       off = 0;
 427       for (int n = 0; n < KRegister::number_of_registers; n++) {
 428         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 429       }
 430 #endif
 431     }
 432   }
 433 
 434   // Recover CPU state
 435   __ pop_CPU_state();
 436   // Get the rbp described implicitly by the calling convention (no oopMap)
 437   __ pop(rbp);
 438 }
 439 
 440 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 441 
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee-save register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only the result registers need to be restored here.
 447 
 448   // Restore fp result register
 449   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 450   // Restore integer result register
 451   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 452   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 453 
  // Pop all of the register save area off the stack except the return address
 455   __ addptr(rsp, return_offset_in_bytes());
 456 }
 457 
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 460 bool SharedRuntime::is_wide_vector(int size) {
 461   return size > 16;
 462 }
 463 
 464 // ---------------------------------------------------------------------------
 465 // Read the array of BasicTypes from a signature, and compute where the
 466 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 467 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 468 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 469 // as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
 474 
 475 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 476 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.
 478 
 479 // The Java calling convention is a "shifted" version of the C ABI.
 480 // By skipping the first C ABI register we can call non-static jni methods
 481 // with small numbers of arguments without having to shuffle the arguments
 482 // at all. Since we control the java ABI we ought to at least get some
 483 // advantage out of it.
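// For example, on Linux/System V targets the platform register declarations
// map j_rarg0..j_rarg5 to c_rarg1..c_rarg5 followed by c_rarg0 (i.e. rsi, rdx,
// rcx, r8, r9, rdi), so the receiver of a non-static JNI call already sits in
// the C register that follows the JNIEnv* argument.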
 484 
 485 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 486                                            VMRegPair *regs,
 487                                            int total_args_passed) {
 488 
 489   // Create the mapping between argument positions and
 490   // registers.
 491   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 492     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 493   };
 494   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 495     j_farg0, j_farg1, j_farg2, j_farg3,
 496     j_farg4, j_farg5, j_farg6, j_farg7
 497   };
 498 
 499 
 500   uint int_args = 0;
 501   uint fp_args = 0;
 502   uint stk_args = 0; // inc by 2 each time
 503 
 504   for (int i = 0; i < total_args_passed; i++) {
 505     switch (sig_bt[i]) {
 506     case T_BOOLEAN:
 507     case T_CHAR:
 508     case T_BYTE:
 509     case T_SHORT:
 510     case T_INT:
 511       if (int_args < Argument::n_int_register_parameters_j) {
 512         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 513       } else {
 514         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 515         stk_args += 2;
 516       }
 517       break;
 518     case T_VOID:
 519       // halves of T_LONG or T_DOUBLE
 520       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 521       regs[i].set_bad();
 522       break;
 523     case T_LONG:
 524       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 525       // fall through
 526     case T_OBJECT:
 527     case T_ARRAY:
 528     case T_ADDRESS:
 529       if (int_args < Argument::n_int_register_parameters_j) {
 530         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 531       } else {
 532         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 533         stk_args += 2;
 534       }
 535       break;
 536     case T_FLOAT:
 537       if (fp_args < Argument::n_float_register_parameters_j) {
 538         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 539       } else {
 540         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 541         stk_args += 2;
 542       }
 543       break;
 544     case T_DOUBLE:
 545       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 546       if (fp_args < Argument::n_float_register_parameters_j) {
 547         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 548       } else {
 549         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 550         stk_args += 2;
 551       }
 552       break;
 553     default:
 554       ShouldNotReachHere();
 555       break;
 556     }
 557   }
 558 
 559   return align_up(stk_args, 2);
 560 }
 561 
 562 // Same as java_calling_convention() but for multiple return
 563 // values. There's no way to store them on the stack so if we don't
 564 // have enough registers, multiple values can't be returned.
 565 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 566 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
 567 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 568                                           VMRegPair *regs,
 569                                           int total_args_passed) {
 570   // Create the mapping between argument positions and
 571   // registers.
 572   static const Register INT_ArgReg[java_return_convention_max_int] = {
 573     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 574   };
 575   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 576     j_farg0, j_farg1, j_farg2, j_farg3,
 577     j_farg4, j_farg5, j_farg6, j_farg7
 578   };
 579 
 580 
 581   uint int_args = 0;
 582   uint fp_args = 0;
 583 
 584   for (int i = 0; i < total_args_passed; i++) {
 585     switch (sig_bt[i]) {
 586     case T_BOOLEAN:
 587     case T_CHAR:
 588     case T_BYTE:
 589     case T_SHORT:
 590     case T_INT:
 591       if (int_args < Argument::n_int_register_parameters_j+1) {
 592         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 593         int_args++;
 594       } else {
 595         return -1;
 596       }
 597       break;
 598     case T_VOID:
 599       // halves of T_LONG or T_DOUBLE
 600       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 601       regs[i].set_bad();
 602       break;
 603     case T_LONG:
 604       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 605       // fall through
 606     case T_OBJECT:
 607     case T_ARRAY:
 608     case T_ADDRESS:
 609     case T_METADATA:
 610       if (int_args < Argument::n_int_register_parameters_j+1) {
 611         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 612         int_args++;
 613       } else {
 614         return -1;
 615       }
 616       break;
 617     case T_FLOAT:
 618       if (fp_args < Argument::n_float_register_parameters_j) {
 619         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 620         fp_args++;
 621       } else {
 622         return -1;
 623       }
 624       break;
 625     case T_DOUBLE:
 626       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 627       if (fp_args < Argument::n_float_register_parameters_j) {
 628         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 629         fp_args++;
 630       } else {
 631         return -1;
 632       }
 633       break;
 634     default:
 635       ShouldNotReachHere();
 636       break;
 637     }
 638   }
 639 
 640   return int_args + fp_args;
 641 }
 642 
// Patch the caller's callsite with the entry to compiled code if it exists.
 644 static void patch_callers_callsite(MacroAssembler *masm) {
 645   Label L;
 646   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 647   __ jcc(Assembler::equal, L);
 648 
 649   // Save the current stack pointer
 650   __ mov(r13, rsp);
 651   // Schedule the branch target address early.
 652   // Call into the VM to patch the caller, then jump to compiled callee
 653   // rax isn't live so capture return address while we easily can
 654   __ movptr(rax, Address(rsp, 0));
 655 
 656   // align stack so push_CPU_state doesn't fault
 657   __ andptr(rsp, -(StackAlignmentInBytes));
 658   __ push_CPU_state();
 659   __ vzeroupper();
 660   // VM needs caller's callsite
 661   // VM needs target method
 662   // This needs to be a long call since we will relocate this adapter to
 663   // the codeBuffer and it may not reach
 664 
 665   // Allocate argument register save area
 666   if (frame::arg_reg_save_area_bytes != 0) {
 667     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 668   }
 669   __ mov(c_rarg0, rbx);
 670   __ mov(c_rarg1, rax);
 671   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 672 
 673   // De-allocate argument register save area
 674   if (frame::arg_reg_save_area_bytes != 0) {
 675     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 676   }
 677 
 678   __ vzeroupper();
 679   __ pop_CPU_state();
 680   // restore sp
 681   __ mov(rsp, r13);
 682   __ bind(L);
 683 }
 684 
 685 // For each inline type argument, sig includes the list of fields of
 686 // the inline type. This utility function computes the number of
 687 // arguments for the call if inline types are passed by reference (the
 688 // calling convention the interpreter expects).
 689 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 690   int total_args_passed = 0;
 691   if (InlineTypePassFieldsAsArgs) {
 692     for (int i = 0; i < sig_extended->length(); i++) {
 693       BasicType bt = sig_extended->at(i)._bt;
 694       if (bt == T_METADATA) {
 695         // In sig_extended, an inline type argument starts with:
 696         // T_METADATA, followed by the types of the fields of the
        // inline type and T_VOID to mark the end of the inline
        // type. Inline types are flattened so, for instance, in the
 699         // case of an inline type with an int field and an inline type
 700         // field that itself has 2 fields, an int and a long:
 701         // T_METADATA T_INT T_METADATA T_INT T_LONG T_VOID (second
 702         // slot for the T_LONG) T_VOID (inner inline type) T_VOID
 703         // (outer inline type)
 704         total_args_passed++;
 705         int vt = 1;
 706         do {
 707           i++;
 708           BasicType bt = sig_extended->at(i)._bt;
 709           BasicType prev_bt = sig_extended->at(i-1)._bt;
 710           if (bt == T_METADATA) {
 711             vt++;
 712           } else if (bt == T_VOID &&
 713                      prev_bt != T_LONG &&
 714                      prev_bt != T_DOUBLE) {
 715             vt--;
 716           }
 717         } while (vt != 0);
 718       } else {
 719         total_args_passed++;
 720       }
 721     }
 722   } else {
 723     total_args_passed = sig_extended->length();
 724   }
 725   return total_args_passed;
 726 }
 727 
 728 
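// Store a single compiled-code argument (held in a register or in a caller
// stack slot described by reg_pair) into the interpreter's outgoing argument
// area, or into a field of a buffered inline type when 'to' addresses the
// heap buffer (is_oop then selects an oop store with the required barriers).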
 729 static void gen_c2i_adapter_helper(MacroAssembler* masm,
 730                                    BasicType bt,
 731                                    BasicType prev_bt,
 732                                    size_t size_in_bytes,
 733                                    const VMRegPair& reg_pair,
 734                                    const Address& to,
 735                                    int extraspace,
 736                                    bool is_oop) {
 737   if (bt == T_VOID) {
 738     assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
 739     return;
 740   }
 741 
 742   // Say 4 args:
 743   // i   st_off
 744   // 0   32 T_LONG
 745   // 1   24 T_VOID
 746   // 2   16 T_OBJECT
 747   // 3    8 T_BOOL
 748   // -    0 return address
 749   //
  // However, to make things extra confusing: because we can fit a long/double in
  // a single slot on a 64-bit VM and it would be silly to break them up, the
  // interpreter leaves one slot empty and only stores to a single slot. In this
  // case the slot that is occupied is the T_VOID slot. See, I said it was confusing.
 754 
 755   bool wide = (size_in_bytes == wordSize);
 756   VMReg r_1 = reg_pair.first();
 757   VMReg r_2 = reg_pair.second();
 758   assert(r_2->is_valid() == wide, "invalid size");
 759   if (!r_1->is_valid()) {
 760     assert(!r_2->is_valid(), "must be invalid");
 761     return;
 762   }
 763 
 764   if (!r_1->is_XMMRegister()) {
 765     Register val = rax;
 766     if (r_1->is_stack()) {
 767       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 768       __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
 769     } else {
 770       val = r_1->as_Register();
 771     }
 772     assert_different_registers(to.base(), val, rscratch1);
 773     if (is_oop) {
 774       __ push(r13);
 775       __ push(rbx);
 776       __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
 777       __ pop(rbx);
 778       __ pop(r13);
 779     } else {
 780       __ store_sized_value(to, val, size_in_bytes);
 781     }
 782   } else {
 783     if (wide) {
 784       __ movdbl(to, r_1->as_XMMRegister());
 785     } else {
 786       __ movflt(to, r_1->as_XMMRegister());
 787     }
 788   }
 789 }
 790 
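// Generate the compiled-to-interpreted adapter: unpack the compiled arguments
// described by sig_extended/regs into the interpreter's stack layout, buffering
// scalarized inline type arguments in newly allocated heap objects if needed,
// and finally jump to the method's interpreter entry point.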
 791 static void gen_c2i_adapter(MacroAssembler *masm,
 792                             const GrowableArray<SigEntry>* sig_extended,
 793                             const VMRegPair *regs,
 794                             bool requires_clinit_barrier,
 795                             address& c2i_no_clinit_check_entry,
 796                             Label& skip_fixup,
 797                             address start,
 798                             OopMapSet* oop_maps,
 799                             int& frame_complete,
 800                             int& frame_size_in_words,
 801                             bool alloc_inline_receiver) {
 802   if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) {
 803     Label L_skip_barrier;
 804     Register method = rbx;
 805 
 806     { // Bypass the barrier for non-static methods
 807       Register flags = rscratch1;
 808       __ movl(flags, Address(method, Method::access_flags_offset()));
 809       __ testl(flags, JVM_ACC_STATIC);
 810       __ jcc(Assembler::zero, L_skip_barrier); // non-static
 811     }
 812 
 813     Register klass = rscratch1;
 814     __ load_method_holder(klass, method);
 815     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
 816 
 817     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
 818 
 819     __ bind(L_skip_barrier);
 820     c2i_no_clinit_check_entry = __ pc();
 821   }
 822 
 823   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 824   bs->c2i_entry_barrier(masm);
 825 
 826   // Before we get into the guts of the C2I adapter, see if we should be here
 827   // at all.  We've come from compiled code and are attempting to jump to the
 828   // interpreter, which means the caller made a static call to get here
 829   // (vcalls always get a compiled target if there is one).  Check for a
 830   // compiled target.  If there is one, we need to patch the caller's call.
 831   patch_callers_callsite(masm);
 832 
 833   __ bind(skip_fixup);
 834 
 835   if (InlineTypePassFieldsAsArgs) {
 836     // Is there an inline type argument?
 837     bool has_inline_argument = false;
 838     for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
 839       has_inline_argument = (sig_extended->at(i)._bt == T_METADATA);
 840     }
 841     if (has_inline_argument) {
 842       // There is at least an inline type argument: we're coming from
 843       // compiled code so we have no buffers to back the inline types.
 844       // Allocate the buffers here with a runtime call.
 845       OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
 846 
 847       frame_complete = __ offset();
 848 
 849       __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
 850 
 851       __ mov(c_rarg0, r15_thread);
 852       __ mov(c_rarg1, rbx);
 853       __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
 854       __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
 855 
 856       oop_maps->add_gc_map((int)(__ pc() - start), map);
 857       __ reset_last_Java_frame(false);
 858 
 859       RegisterSaver::restore_live_registers(masm);
 860 
 861       Label no_exception;
 862       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
 863       __ jcc(Assembler::equal, no_exception);
 864 
 865       __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
 866       __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 867       __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 868 
 869       __ bind(no_exception);
 870 
 871       // We get an array of objects from the runtime call
 872       __ get_vm_result(rscratch2, r15_thread); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
 873       __ get_vm_result_2(rbx, r15_thread); // TODO: required to keep the callee Method live?
 874     }
 875   }
 876 
 877   // Since all args are passed on the stack, total_args_passed *
 878   // Interpreter::stackElementSize is the space we need.
 879   int total_args_passed = compute_total_args_passed_int(sig_extended);
 880   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 881 
 882   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 883 
 884   // stack is aligned, keep it that way
 885   // This is not currently needed or enforced by the interpreter, but
 886   // we might as well conform to the ABI.
 887   extraspace = align_up(extraspace, 2*wordSize);
 888 
 889   // set senderSP value
 890   __ lea(r13, Address(rsp, wordSize));
 891 
 892 #ifdef ASSERT
 893   __ check_stack_alignment(r13, "sender stack not aligned");
 894 #endif
 895   if (extraspace > 0) {
 896     // Pop the return address
 897     __ pop(rax);
 898 
 899     __ subptr(rsp, extraspace);
 900 
 901     // Push the return address
 902     __ push(rax);
 903 
 904     // Account for the return address location since we store it first rather
 905     // than hold it in a register across all the shuffling
 906     extraspace += wordSize;
 907   }
 908 
 909 #ifdef ASSERT
 910   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 911 #endif
 912 
 913   // Now write the args into the outgoing interpreter space
 914 
 915   // next_arg_comp is the next argument from the compiler point of
 916   // view (inline type fields are passed in registers/on the stack). In
 917   // sig_extended, an inline type argument starts with: T_METADATA,
 918   // followed by the types of the fields of the inline type and T_VOID
 919   // to mark the end of the inline type. ignored counts the number of
 920   // T_METADATA/T_VOID. next_vt_arg is the next inline type argument:
 921   // used to get the buffer for that argument from the pool of buffers
 922   // we allocated above and want to pass to the
 923   // interpreter. next_arg_int is the next argument from the
 924   // interpreter point of view (inline types are passed by reference).
 925   for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
 926        next_arg_comp < sig_extended->length(); next_arg_comp++) {
 927     assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
 928     assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
 929     BasicType bt = sig_extended->at(next_arg_comp)._bt;
 930     int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
 931     if (!InlineTypePassFieldsAsArgs || bt != T_METADATA) {
 932       int next_off = st_off - Interpreter::stackElementSize;
 933       const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
 934       const VMRegPair reg_pair = regs[next_arg_comp-ignored];
 935       size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
 936       gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 937                              size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
 938       next_arg_int++;
 939 #ifdef ASSERT
 940       if (bt == T_LONG || bt == T_DOUBLE) {
 941         // Overwrite the unused slot with known junk
 942         __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 943         __ movptr(Address(rsp, st_off), rax);
 944       }
 945 #endif /* ASSERT */
 946     } else {
 947       ignored++;
 948       // get the buffer from the just allocated pool of buffers
 949       int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_OBJECT);
 950       __ load_heap_oop(r14, Address(rscratch2, index));
 951       next_vt_arg++; next_arg_int++;
 952       int vt = 1;
 953       // write fields we get from compiled code in registers/stack
 954       // slots to the buffer: we know we are done with that inline type
 955       // argument when we hit the T_VOID that acts as an end of inline
 956       // type delimiter for this inline type. Inline types are flattened
 957       // so we might encounter embedded inline types. Each entry in
 958       // sig_extended contains a field offset in the buffer.
 959       Label L_null;
 960       do {
 961         next_arg_comp++;
 962         BasicType bt = sig_extended->at(next_arg_comp)._bt;
 963         BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
 964         if (bt == T_METADATA) {
 965           vt++;
 966           ignored++;
 967         } else if (bt == T_VOID &&
 968                    prev_bt != T_LONG &&
 969                    prev_bt != T_DOUBLE) {
 970           vt--;
 971           ignored++;
 972         } else {
 973           int off = sig_extended->at(next_arg_comp)._offset;
 974           if (off == -1) {
 975             // Nullable inline type argument, emit null check
 976             VMReg reg = regs[next_arg_comp-ignored].first();
 977             Label L_notNull;
 978             if (reg->is_stack()) {
 979               int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 980               __ testb(Address(rsp, ld_off), 1);
 981             } else {
 982               __ testb(reg->as_Register(), 1);
 983             }
 984             __ jcc(Assembler::notZero, L_notNull);
 985             __ movptr(Address(rsp, st_off), 0);
 986             __ jmp(L_null);
 987             __ bind(L_notNull);
 988             continue;
 989           }
 990           assert(off > 0, "offset in object should be positive");
 991           size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
 992           bool is_oop = is_reference_type(bt);
 993           gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 994                                  size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
 995         }
 996       } while (vt != 0);
 997       // pass the buffer to the interpreter
 998       __ movptr(Address(rsp, st_off), r14);
 999       __ bind(L_null);
1000     }
1001   }
1002 
1003   // Schedule the branch target address early.
1004   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
1005   __ jmp(rcx);
1006 }
1007 
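// Check that pc_reg lies inside (code_start, code_end): jump to L_ok if it
// does, otherwise fall through (L_fail is bound at the end) so the caller can
// emit the failure handling directly after this check.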
1008 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
1009                         address code_start, address code_end,
1010                         Label& L_ok) {
1011   Label L_fail;
1012   __ lea(temp_reg, ExternalAddress(code_start));
1013   __ cmpptr(pc_reg, temp_reg);
1014   __ jcc(Assembler::belowEqual, L_fail);
1015   __ lea(temp_reg, ExternalAddress(code_end));
1016   __ cmpptr(pc_reg, temp_reg);
1017   __ jcc(Assembler::below, L_ok);
1018   __ bind(L_fail);
1019 }
1020 
1021 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
1022                                     int comp_args_on_stack,
1023                                     const GrowableArray<SigEntry>* sig,
1024                                     const VMRegPair *regs) {
1025 
  // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args because
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment we expect in all compiled code and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
1034 
1035   // Adapters can be frameless because they do not require the caller
1036   // to perform additional cleanup work, such as correcting the stack pointer.
1037   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
1038   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
1039   // even if a callee has modified the stack pointer.
1040   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
1041   // routinely repairs its caller's stack pointer (from sender_sp, which is set
1042   // up via the senderSP register).
1043   // In other words, if *either* the caller or callee is interpreted, we can
1044   // get the stack pointer repaired after a call.
1045   // This is why c2i and i2c adapters cannot be indefinitely composed.
1046   // In particular, if a c2i adapter were to somehow call an i2c adapter,
1047   // both caller and callee would be compiled methods, and neither would
1048   // clean up the stack pointer changes performed by the two adapters.
1049   // If this happens, control eventually transfers back to the compiled
1050   // caller, but with an uncorrected stack, causing delayed havoc.
1051 
1052   if (VerifyAdapterCalls &&
1053       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
1054     // So, let's test for cascading c2i/i2c adapters right now.
1055     //  assert(Interpreter::contains($return_addr) ||
1056     //         StubRoutines::contains($return_addr),
1057     //         "i2c adapter must return to an interpreter frame");
1058     __ block_comment("verify_i2c { ");
1059     // Pick up the return address
1060     __ movptr(rax, Address(rsp, 0));
1061     Label L_ok;
1062     if (Interpreter::code() != nullptr) {
1063       range_check(masm, rax, r11,
1064                   Interpreter::code()->code_start(),
1065                   Interpreter::code()->code_end(),
1066                   L_ok);
1067     }
1068     if (StubRoutines::initial_stubs_code() != nullptr) {
1069       range_check(masm, rax, r11,
1070                   StubRoutines::initial_stubs_code()->code_begin(),
1071                   StubRoutines::initial_stubs_code()->code_end(),
1072                   L_ok);
1073     }
1074     if (StubRoutines::final_stubs_code() != nullptr) {
1075       range_check(masm, rax, r11,
1076                   StubRoutines::final_stubs_code()->code_begin(),
1077                   StubRoutines::final_stubs_code()->code_end(),
1078                   L_ok);
1079     }
1080     const char* msg = "i2c adapter must return to an interpreter frame";
1081     __ block_comment(msg);
1082     __ stop(msg);
1083     __ bind(L_ok);
1084     __ block_comment("} verify_i2ce ");
1085   }
1086 
1087   // Must preserve original SP for loading incoming arguments because
1088   // we need to align the outgoing SP for compiled code.
1089   __ movptr(r11, rsp);
1090 
1091   // Pick up the return address
1092   __ pop(rax);
1093 
1094   // Convert 4-byte c2 stack slots to words.
1095   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
1096 
1097   if (comp_args_on_stack) {
1098     __ subptr(rsp, comp_words_on_stack * wordSize);
1099   }
1100 
1101   // Ensure compiled code always sees stack at proper alignment
1102   __ andptr(rsp, -16);
1103 
  // Push the return address; this misaligns the stack so that the youngest frame
  // always sees the same layout, relative to the placement of the call
  // instruction, that it would see after a real call.
1106   __ push(rax);
1107 
1108   // Put saved SP in another register
1109   const Register saved_sp = rax;
1110   __ movptr(saved_sp, r11);
1111 
1112   // Will jump to the compiled code just as if compiled code was doing it.
1113   // Pre-load the register-jump target early, to schedule it better.
1114   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1115 
1116 #if INCLUDE_JVMCI
1117   if (EnableJVMCI) {
1118     // check if this call should be routed towards a specific entry point
1119     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1120     Label no_alternative_target;
1121     __ jcc(Assembler::equal, no_alternative_target);
1122     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1123     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1124     __ bind(no_alternative_target);
1125   }
1126 #endif // INCLUDE_JVMCI
1127 
1128   int total_args_passed = sig->length();
1129 
1130   // Now generate the shuffle code.  Pick up all register args and move the
1131   // rest through the floating point stack top.
1132   for (int i = 0; i < total_args_passed; i++) {
1133     BasicType bt = sig->at(i)._bt;
1134     if (bt == T_VOID) {
1135       // Longs and doubles are passed in native word order, but misaligned
1136       // in the 32-bit build.
1137       BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1138       assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1139       continue;
1140     }
1141 
1142     // Pick up 0, 1 or 2 words from SP+offset.
1143 
1144     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1145             "scrambled load targets?");
1146     // Load in argument order going down.
1147     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1148     // Point to interpreter value (vs. tag)
1149     int next_off = ld_off - Interpreter::stackElementSize;
1150     //
1151     //
1152     //
1153     VMReg r_1 = regs[i].first();
1154     VMReg r_2 = regs[i].second();
1155     if (!r_1->is_valid()) {
1156       assert(!r_2->is_valid(), "");
1157       continue;
1158     }
1159     if (r_1->is_stack()) {
1160       // Convert stack slot to an SP offset (+ wordSize to account for return address )
1161       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1162 
1163       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going through a c2i because of a miss, a reasonable value of r13
1165       // will be generated.
1166       if (!r_2->is_valid()) {
1167         // sign extend???
1168         __ movl(r13, Address(saved_sp, ld_off));
1169         __ movptr(Address(rsp, st_off), r13);
1170       } else {
1171         //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
1175         //
1176         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
1177         // are accessed as negative so LSW is at LOW address
1178 
1179         // ld_off is MSW so get LSW
1180         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1181                            next_off : ld_off;
1182         __ movq(r13, Address(saved_sp, offset));
1183         // st_off is LSW (i.e. reg.first())
1184         __ movq(Address(rsp, st_off), r13);
1185       }
1186     } else if (r_1->is_Register()) {  // Register argument
1187       Register r = r_1->as_Register();
1188       assert(r != rax, "must be different");
1189       if (r_2->is_valid()) {
1190         //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
1194 
1195         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1196                            next_off : ld_off;
1197 
1198         // this can be a misaligned move
1199         __ movq(r, Address(saved_sp, offset));
1200       } else {
1201         // sign extend and use a full word?
1202         __ movl(r, Address(saved_sp, ld_off));
1203       }
1204     } else {
1205       if (!r_2->is_valid()) {
1206         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1207       } else {
1208         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1209       }
1210     }
1211   }
1212 
1213   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1214 
1215   // 6243940 We might end up in handle_wrong_method if
  // the callee is deoptimized as we race through here. If that
1217   // happens we don't want to take a safepoint because the
1218   // caller frame will look interpreted and arguments are now
1219   // "compiled" so it is much better to make this transition
1220   // invisible to the stack walking code. Unfortunately if
1221   // we try and find the callee by normal means a safepoint
1222   // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
1224 
1225   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1226 
  // put the Method* where a c2i would expect it should we end up there;
  // only needed because c2's resolve stubs return the Method* as a result in
  // rax
1230   __ mov(rax, rbx);
1231   __ jmp(r11);
1232 }
1233 
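// Inline cache check for the c2i unverified entry: rax holds the
// CompiledICHolder and j_rarg0 the receiver. On a receiver klass mismatch we
// jump to the ic-miss stub; otherwise rbx is loaded with the callee Method*,
// and if that method has been compiled in the meantime we also treat the call
// as a miss so the call site can be re-patched, else we continue at skip_fixup.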
1234 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1235   Label ok;
1236 
1237   Register holder = rax;
1238   Register receiver = j_rarg0;
1239   Register temp = rbx;
1240 
1241   __ load_klass(temp, receiver, rscratch1);
1242   __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
1243   __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
1244   __ jcc(Assembler::equal, ok);
1245   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1246 
1247   __ bind(ok);
  // The method might have been compiled since the call site was patched to
  // interpreted; if that is the case, treat it as a miss so we can get
  // the call site corrected.
1251   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1252   __ jcc(Assembler::equal, skip_fixup);
1253   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1254 }
1255 
1256 // ---------------------------------------------------------------
1257 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1258                                                             int comp_args_on_stack,
1259                                                             const GrowableArray<SigEntry>* sig,
1260                                                             const VMRegPair* regs,
1261                                                             const GrowableArray<SigEntry>* sig_cc,
1262                                                             const VMRegPair* regs_cc,
1263                                                             const GrowableArray<SigEntry>* sig_cc_ro,
1264                                                             const VMRegPair* regs_cc_ro,
1265                                                             AdapterFingerPrint* fingerprint,
1266                                                             AdapterBlob*& new_adapter,
1267                                                             bool allocate_code_blob) {
1268   address i2c_entry = __ pc();
1269   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1270 
1271   // -------------------------------------------------------------------------
1272   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1273   // to the interpreter.  The args start out packed in the compiled layout.  They
1274   // need to be unpacked into the interpreter layout.  This will almost always
1275   // require some stack space.  We grow the current (compiled) stack, then repack
1276   // the args.  We  finally end in a jump to the generic interpreter entry point.
1277   // On exit from the interpreter, the interpreter will restore our SP (lest the
1278   // compiled code, which relies solely on SP and not RBP, get sick).
1279 
1280   address c2i_unverified_entry        = __ pc();
1281   address c2i_unverified_inline_entry = __ pc();
1282   Label skip_fixup;
1283 
1284   gen_inline_cache_check(masm, skip_fixup);
1285 
1286   OopMapSet* oop_maps = new OopMapSet();
1287   int frame_complete = CodeOffsets::frame_never_safe;
1288   int frame_size_in_words = 0;
1289 
1290   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1291   address c2i_no_clinit_check_entry = nullptr;
1292   address c2i_inline_ro_entry = __ pc();
1293   if (regs_cc != regs_cc_ro) {
1294     // No class init barrier needed because method is guaranteed to be non-static
1295     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, c2i_no_clinit_check_entry,
1296                     skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1297     skip_fixup.reset();
1298   }
1299 
1300   // Scalarized c2i adapter
1301   address c2i_entry        = __ pc();
1302   address c2i_inline_entry = __ pc();
1303   gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1304                   skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);
1305 
1306   // Non-scalarized c2i adapter
1307   if (regs != regs_cc) {
1308     c2i_unverified_inline_entry = __ pc();
1309     Label inline_entry_skip_fixup;
1310     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1311 
1312     c2i_inline_entry = __ pc();
1313     gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1314                     inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1315   }
1316 
1317 
  // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
  // the GC knows about the locations of the oop arguments passed to the c2i adapter.
1320   if (allocate_code_blob) {
1321     bool caller_must_gc_arguments = (regs != regs_cc);
1322     new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1323   }
1324 
1325   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry, c2i_unverified_inline_entry, c2i_no_clinit_check_entry);
1326 }
1327 
1328 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1329                                          VMRegPair *regs,
1330                                          int total_args_passed) {
1331 
// We return the number of VMRegImpl stack slots we need to reserve for all
// the arguments NOT counting out_preserve_stack_slots.
1334 
1335 // NOTE: These arrays will have to change when c1 is ported
1336 #ifdef _WIN64
1337     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1338       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1339     };
1340     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1341       c_farg0, c_farg1, c_farg2, c_farg3
1342     };
1343 #else
1344     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1345       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1346     };
1347     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1348       c_farg0, c_farg1, c_farg2, c_farg3,
1349       c_farg4, c_farg5, c_farg6, c_farg7
1350     };
1351 #endif // _WIN64
1352 
1353 
1354     uint int_args = 0;
1355     uint fp_args = 0;
1356     uint stk_args = 0; // inc by 2 each time
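    // Stack arguments are tracked in 32-bit VMRegImpl slots; every stack-passed
    // C argument occupies a 64-bit slot (and on Win64 register args also reserve
    // their shadow-space slots), hence the "+= 2" increments below.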
1357 
1358     for (int i = 0; i < total_args_passed; i++) {
1359       switch (sig_bt[i]) {
1360       case T_BOOLEAN:
1361       case T_CHAR:
1362       case T_BYTE:
1363       case T_SHORT:
1364       case T_INT:
1365         if (int_args < Argument::n_int_register_parameters_c) {
1366           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1367 #ifdef _WIN64
1368           fp_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1370           stk_args += 2;
1371 #endif
1372         } else {
1373           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1374           stk_args += 2;
1375         }
1376         break;
1377       case T_LONG:
1378         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1379         // fall through
1380       case T_OBJECT:
1381       case T_ARRAY:
1382       case T_ADDRESS:
1383       case T_METADATA:
1384         if (int_args < Argument::n_int_register_parameters_c) {
1385           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1386 #ifdef _WIN64
1387           fp_args++;
1388           stk_args += 2;
1389 #endif
1390         } else {
1391           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1392           stk_args += 2;
1393         }
1394         break;
1395       case T_FLOAT:
1396         if (fp_args < Argument::n_float_register_parameters_c) {
1397           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1398 #ifdef _WIN64
1399           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1401           stk_args += 2;
1402 #endif
1403         } else {
1404           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1405           stk_args += 2;
1406         }
1407         break;
1408       case T_DOUBLE:
1409         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1410         if (fp_args < Argument::n_float_register_parameters_c) {
1411           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1412 #ifdef _WIN64
1413           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1415           stk_args += 2;
1416 #endif
1417         } else {
1418           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1419           stk_args += 2;
1420         }
1421         break;
1422       case T_VOID: // Halves of longs and doubles
1423         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1424         regs[i].set_bad();
1425         break;
1426       default:
1427         ShouldNotReachHere();
1428         break;
1429       }
1430     }
1431 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
1434   if (stk_args < 8) {
1435     stk_args = 8;
1436   }
1437 #endif // _WIN64
1438 
1439   return stk_args;
1440 }
1441 
1442 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1443                                              uint num_bits,
1444                                              uint total_args_passed) {
1445   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1446          "only certain vector sizes are supported for now");
1447 
1448   static const XMMRegister VEC_ArgReg[32] = {
1449      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1450      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1451     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1452     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1453   };
1454 
1455   uint stk_args = 0;
1456   uint fp_args = 0;
1457 
1458   for (uint i = 0; i < total_args_passed; i++) {
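    // Each vector argument is passed in a register and spans num_bits/32 of the
    // 32-bit VMRegImpl slots; record the first and last slot of that span.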
1459     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1460     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1461     regs[i].set_pair(vmreg->next(next_val), vmreg);
1462   }
1463 
1464   return stk_args;
1465 }
1466 
1467 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1470   switch (ret_type) {
1471   case T_FLOAT:
1472     __ movflt(Address(rbp, -wordSize), xmm0);
1473     break;
1474   case T_DOUBLE:
1475     __ movdbl(Address(rbp, -wordSize), xmm0);
1476     break;
1477   case T_VOID:  break;
1478   default: {
1479     __ movptr(Address(rbp, -wordSize), rax);
1480     }
1481   }
1482 }
1483 
1484 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the
  // frame pointer, which by this time is free to use.
1487   switch (ret_type) {
1488   case T_FLOAT:
1489     __ movflt(xmm0, Address(rbp, -wordSize));
1490     break;
1491   case T_DOUBLE:
1492     __ movdbl(xmm0, Address(rbp, -wordSize));
1493     break;
1494   case T_VOID:  break;
1495   default: {
1496     __ movptr(rax, Address(rbp, -wordSize));
1497     }
1498   }
1499 }
1500 
1501 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
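    // Preserve the outgoing argument registers across a VM call: integer
    // registers are pushed, XMM registers are spilled to 16-byte stack slots.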
1502     for ( int i = first_arg ; i < arg_count ; i++ ) {
1503       if (args[i].first()->is_Register()) {
1504         __ push(args[i].first()->as_Register());
1505       } else if (args[i].first()->is_XMMRegister()) {
1506         __ subptr(rsp, 2*wordSize);
1507         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1508       }
1509     }
1510 }
1511 
1512 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1513     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1514       if (args[i].first()->is_Register()) {
1515         __ pop(args[i].first()->as_Register());
1516       } else if (args[i].first()->is_XMMRegister()) {
1517         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1518         __ addptr(rsp, 2*wordSize);
1519       }
1520     }
1521 }
1522 
1523 static void verify_oop_args(MacroAssembler* masm,
1524                             const methodHandle& method,
1525                             const BasicType* sig_bt,
1526                             const VMRegPair* regs) {
1527   Register temp_reg = rbx;  // not part of any compiled calling seq
1528   if (VerifyOops) {
1529     for (int i = 0; i < method->size_of_parameters(); i++) {
1530       if (is_reference_type(sig_bt[i])) {
1531         VMReg r = regs[i].first();
1532         assert(r->is_valid(), "bad oop arg");
1533         if (r->is_stack()) {
1534           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1535           __ verify_oop(temp_reg);
1536         } else {
1537           __ verify_oop(r->as_Register());
1538         }
1539       }
1540     }
1541   }
1542 }
1543 
1544 static void check_continuation_enter_argument(VMReg actual_vmreg,
1545                                               Register expected_reg,
1546                                               const char* name) {
1547   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1548   assert(actual_vmreg->as_Register() == expected_reg,
1549          "%s is in unexpected register: %s instead of %s",
1550          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1551 }
1552 
1553 
1554 //---------------------------- continuation_enter_setup ---------------------------
1555 //
1556 // Arguments:
1557 //   None.
1558 //
1559 // Results:
1560 //   rsp: pointer to blank ContinuationEntry
1561 //
1562 // Kills:
1563 //   rax
1564 //
1565 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1566   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1567   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1568   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1569 
1570   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1571   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1572 
1573   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1574   OopMap* map = new OopMap(frame_size, 0);
1575 
1576   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
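  // rax holds the previous ContinuationEntry; record it as this entry's parent
  // and make the new entry the thread's current one.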
1577   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1578   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1579 
1580   return map;
1581 }
1582 
1583 //---------------------------- fill_continuation_entry ---------------------------
1584 //
1585 // Arguments:
1586 //   rsp: pointer to blank Continuation entry
1587 //   reg_cont_obj: pointer to the continuation
1588 //   reg_flags: flags
1589 //
1590 // Results:
1591 //   rsp: pointer to filled out ContinuationEntry
1592 //
1593 // Kills:
1594 //   rax
1595 //
1596 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1597   assert_different_registers(rax, reg_cont_obj, reg_flags);
1598 #ifdef ASSERT
1599   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1600 #endif
1601   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1602   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1603   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1604   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1605   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1606 
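  // Save the caller's cont_fastpath and held monitor count in the entry, then
  // clear them in the thread for the new continuation.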
1607   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1608   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1609   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1610   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1611 
1612   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1613   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1614 }
1615 
1616 //---------------------------- continuation_enter_cleanup ---------------------------
1617 //
1618 // Arguments:
1619 //   rsp: pointer to the ContinuationEntry
1620 //
1621 // Results:
1622 //   rsp: pointer to the spilled rbp in the entry frame
1623 //
1624 // Kills:
1625 //   rbx
1626 //
static void continuation_enter_cleanup(MacroAssembler* masm) {
1628 #ifdef ASSERT
1629   Label L_good_sp;
1630   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1631   __ jcc(Assembler::equal, L_good_sp);
1632   __ stop("Incorrect rsp at continuation_enter_cleanup");
1633   __ bind(L_good_sp);
1634 #endif
1635 
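  // Restore the parent's cont_fastpath and held monitor count, re-link the parent
  // ContinuationEntry as the thread's current one, and pop this entry off the stack.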
1636   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1637   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1638   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1639   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1640 
1641   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1642   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1643   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1644 }
1645 
1646 static void gen_continuation_enter(MacroAssembler* masm,
1647                                    const VMRegPair* regs,
1648                                    int& exception_offset,
1649                                    OopMapSet* oop_maps,
1650                                    int& frame_complete,
1651                                    int& stack_slots,
1652                                    int& interpreted_entry_offset,
1653                                    int& compiled_entry_offset) {
1654 
1655   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1656   int pos_cont_obj   = 0;
1657   int pos_is_cont    = 1;
1658   int pos_is_virtual = 2;
1659 
1660   // The platform-specific calling convention may present the arguments in various registers.
1661   // To simplify the rest of the code, we expect the arguments to reside at these known
1662   // registers, and we additionally check the placement here in case calling convention ever
1663   // changes.
1664   Register reg_cont_obj   = c_rarg1;
1665   Register reg_is_cont    = c_rarg2;
1666   Register reg_is_virtual = c_rarg3;
1667 
1668   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1669   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1670   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1671 
1672   // Utility methods kill rax, make sure there are no collisions
1673   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1674 
1675   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1676                          relocInfo::static_call_type);
1677 
1678   address start = __ pc();
1679 
1680   Label L_thaw, L_exit;
1681 
1682   // i2i entry used at interp_only_mode only
1683   interpreted_entry_offset = __ pc() - start;
1684   {
1685 #ifdef ASSERT
1686     Label is_interp_only;
1687     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1688     __ jcc(Assembler::notEqual, is_interp_only);
1689     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1690     __ bind(is_interp_only);
1691 #endif
1692 
1693     __ pop(rax); // return address
1694     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1695     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1696     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1697     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1698     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1699     __ push(rax); // return address
1700     __ push_cont_fastpath();
1701 
1702     __ enter();
1703 
1704     stack_slots = 2; // will be adjusted in setup
1705     OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
    // That's okay: at the very worst we'll miss an async sample, but we're in interp_only_mode anyway.
1708 
1709     __ verify_oop(reg_cont_obj);
1710 
1711     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1712 
1713     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1714     __ testptr(reg_is_cont, reg_is_cont);
1715     __ jcc(Assembler::notZero, L_thaw);
1716 
1717     // --- Resolve path
1718 
1719     // Make sure the call is patchable
1720     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1721     // Emit stub for static call
1722     CodeBuffer* cbuf = masm->code_section()->outer();
1723     address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1724     if (stub == nullptr) {
1725       fatal("CodeCache is full at gen_continuation_enter");
1726     }
1727     __ call(resolve);
1728     oop_maps->add_gc_map(__ pc() - start, map);
1729     __ post_call_nop();
1730 
1731     __ jmp(L_exit);
1732   }
1733 
1734   // compiled entry
1735   __ align(CodeEntryAlignment);
1736   compiled_entry_offset = __ pc() - start;
1737   __ enter();
1738 
1739   stack_slots = 2; // will be adjusted in setup
1740   OopMap* map = continuation_enter_setup(masm, stack_slots);
1741 
1742   // Frame is now completed as far as size and linkage.
1743   frame_complete = __ pc() - start;
1744 
1745   __ verify_oop(reg_cont_obj);
1746 
1747   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1748 
1749   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1750   __ testptr(reg_is_cont, reg_is_cont);
1751   __ jccb(Assembler::notZero, L_thaw);
1752 
1753   // --- call Continuation.enter(Continuation c, boolean isContinue)
1754 
1755   // Make sure the call is patchable
1756   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1757 
1758   // Emit stub for static call
1759   CodeBuffer* cbuf = masm->code_section()->outer();
1760   address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1761   if (stub == nullptr) {
1762     fatal("CodeCache is full at gen_continuation_enter");
1763   }
1764 
1765   // The call needs to be resolved. There's a special case for this in
1766   // SharedRuntime::find_callee_info_helper() which calls
1767   // LinkResolver::resolve_continuation_enter() which resolves the call to
1768   // Continuation.enter(Continuation c, boolean isContinue).
1769   __ call(resolve);
1770 
1771   oop_maps->add_gc_map(__ pc() - start, map);
1772   __ post_call_nop();
1773 
1774   __ jmpb(L_exit);
1775 
1776   // --- Thawing path
1777 
1778   __ bind(L_thaw);
1779 
1780   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1781 
1782   ContinuationEntry::_return_pc_offset = __ pc() - start;
1783   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1784   __ post_call_nop();
1785 
1786   // --- Normal exit (resolve/thawing)
1787 
1788   __ bind(L_exit);
1789 
1790   continuation_enter_cleanup(masm);
1791   __ pop(rbp);
1792   __ ret(0);
1793 
1794   // --- Exception handling path
1795 
1796   exception_offset = __ pc() - start;
1797 
1798   continuation_enter_cleanup(masm);
1799   __ pop(rbp);
1800 
1801   __ movptr(c_rarg0, r15_thread);
1802   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1803 
1804   // rax still holds the original exception oop, save it before the call
1805   __ push(rax);
1806 
1807   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1808   __ movptr(rbx, rax);
1809 
1810   // Continue at exception handler:
1811   //   rax: exception oop
1812   //   rbx: exception handler
1813   //   rdx: exception pc
1814   __ pop(rax);
1815   __ verify_oop(rax);
1816   __ pop(rdx);
1817   __ jmp(rbx);
1818 }
1819 
1820 static void gen_continuation_yield(MacroAssembler* masm,
1821                                    const VMRegPair* regs,
1822                                    OopMapSet* oop_maps,
1823                                    int& frame_complete,
1824                                    int& stack_slots,
1825                                    int& compiled_entry_offset) {
1826   enum layout {
1827     rbp_off,
1828     rbpH_off,
1829     return_off,
1830     return_off2,
1831     framesize // inclusive of return address
1832   };
1833   stack_slots = framesize /  VMRegImpl::slots_per_word;
1834   assert(stack_slots == 2, "recheck layout");
1835 
1836   address start = __ pc();
1837   compiled_entry_offset = __ pc() - start;
1838   __ enter();
1839   address the_pc = __ pc();
1840 
1841   frame_complete = the_pc - start;
1842 
  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, so associate the OopMap
  // with it right away.
1846   __ post_call_nop();
1847   OopMap* map = new OopMap(framesize, 1);
1848   oop_maps->add_gc_map(frame_complete, map);
1849 
1850   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1851   __ movptr(c_rarg0, r15_thread);
1852   __ movptr(c_rarg1, rsp);
1853   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1854   __ reset_last_Java_frame(true);
1855 
1856   Label L_pinned;
1857 
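  // rax holds the result of the freeze: zero means the yield succeeded, non-zero
  // means the continuation could not be frozen (e.g. it is pinned).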
1858   __ testptr(rax, rax);
1859   __ jcc(Assembler::notZero, L_pinned);
1860 
1861   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1862   continuation_enter_cleanup(masm);
1863   __ pop(rbp);
1864   __ ret(0);
1865 
1866   __ bind(L_pinned);
1867 
1868   // Pinned, return to caller
1869 
1870   // handle pending exception thrown by freeze
1871   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1872   Label ok;
1873   __ jcc(Assembler::equal, ok);
1874   __ leave();
1875   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1876   __ bind(ok);
1877 
1878   __ leave();
1879   __ ret(0);
1880 }
1881 
1882 static void gen_special_dispatch(MacroAssembler* masm,
1883                                  const methodHandle& method,
1884                                  const BasicType* sig_bt,
1885                                  const VMRegPair* regs) {
1886   verify_oop_args(masm, method, sig_bt, regs);
1887   vmIntrinsics::ID iid = method->intrinsic_id();
1888 
1889   // Now write the args into the outgoing interpreter space
1890   bool     has_receiver   = false;
1891   Register receiver_reg   = noreg;
1892   int      member_arg_pos = -1;
1893   Register member_reg     = noreg;
1894   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1895   if (ref_kind != 0) {
1896     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1897     member_reg = rbx;  // known to be free at this point
1898     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1899   } else if (iid == vmIntrinsics::_invokeBasic) {
1900     has_receiver = true;
1901   } else if (iid == vmIntrinsics::_linkToNative) {
1902     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1903     member_reg = rbx;  // known to be free at this point
1904   } else {
1905     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1906   }
1907 
1908   if (member_reg != noreg) {
1909     // Load the member_arg into register, if necessary.
1910     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1911     VMReg r = regs[member_arg_pos].first();
1912     if (r->is_stack()) {
1913       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1914     } else {
1915       // no data motion is needed
1916       member_reg = r->as_Register();
1917     }
1918   }
1919 
1920   if (has_receiver) {
1921     // Make sure the receiver is loaded into a register.
1922     assert(method->size_of_parameters() > 0, "oob");
1923     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1924     VMReg r = regs[0].first();
1925     assert(r->is_valid(), "bad receiver arg");
1926     if (r->is_stack()) {
1927       // Porting note:  This assumes that compiled calling conventions always
1928       // pass the receiver oop in a register.  If this is not true on some
1929       // platform, pick a temp and load the receiver from stack.
1930       fatal("receiver always in a register");
1931       receiver_reg = j_rarg0;  // known to be free at this point
1932       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1933     } else {
1934       // no data motion is needed
1935       receiver_reg = r->as_Register();
1936     }
1937   }
1938 
1939   // Figure out which address we are really jumping to:
1940   MethodHandles::generate_method_handle_dispatch(masm, iid,
1941                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1942 }
1943 
1944 // ---------------------------------------------------------------------------
1945 // Generate a native wrapper for a given method.  The method takes arguments
1946 // in the Java compiled code convention, marshals them to the native
1947 // convention (handlizes oops, etc), transitions to native, makes the call,
1948 // returns to java state (possibly blocking), unhandlizes any result and
1949 // returns.
1950 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
// block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1959 //
1960 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1961                                                 const methodHandle& method,
1962                                                 int compile_id,
1963                                                 BasicType* in_sig_bt,
1964                                                 VMRegPair* in_regs,
1965                                                 BasicType ret_type) {
1966   if (method->is_continuation_native_intrinsic()) {
1967     int exception_offset = -1;
1968     OopMapSet* oop_maps = new OopMapSet();
1969     int frame_complete = -1;
1970     int stack_slots = -1;
1971     int interpreted_entry_offset = -1;
1972     int vep_offset = -1;
1973     if (method->is_continuation_enter_intrinsic()) {
1974       gen_continuation_enter(masm,
1975                              in_regs,
1976                              exception_offset,
1977                              oop_maps,
1978                              frame_complete,
1979                              stack_slots,
1980                              interpreted_entry_offset,
1981                              vep_offset);
1982     } else if (method->is_continuation_yield_intrinsic()) {
1983       gen_continuation_yield(masm,
1984                              in_regs,
1985                              oop_maps,
1986                              frame_complete,
1987                              stack_slots,
1988                              vep_offset);
1989     } else {
1990       guarantee(false, "Unknown Continuation native intrinsic");
1991     }
1992 
1993 #ifdef ASSERT
1994     if (method->is_continuation_enter_intrinsic()) {
1995       assert(interpreted_entry_offset != -1, "Must be set");
1996       assert(exception_offset != -1,         "Must be set");
1997     } else {
1998       assert(interpreted_entry_offset == -1, "Must be unset");
1999       assert(exception_offset == -1,         "Must be unset");
2000     }
2001     assert(frame_complete != -1,    "Must be set");
2002     assert(stack_slots != -1,       "Must be set");
2003     assert(vep_offset != -1,        "Must be set");
2004 #endif
2005 
2006     __ flush();
2007     nmethod* nm = nmethod::new_native_nmethod(method,
2008                                               compile_id,
2009                                               masm->code(),
2010                                               vep_offset,
2011                                               frame_complete,
2012                                               stack_slots,
2013                                               in_ByteSize(-1),
2014                                               in_ByteSize(-1),
2015                                               oop_maps,
2016                                               exception_offset);
2017     if (method->is_continuation_enter_intrinsic()) {
2018       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2019     } else if (method->is_continuation_yield_intrinsic()) {
2020       _cont_doYield_stub = nm;
2021     }
2022     return nm;
2023   }
2024 
2025   if (method->is_method_handle_intrinsic()) {
2026     vmIntrinsics::ID iid = method->intrinsic_id();
2027     intptr_t start = (intptr_t)__ pc();
2028     int vep_offset = ((intptr_t)__ pc()) - start;
2029     gen_special_dispatch(masm,
2030                          method,
2031                          in_sig_bt,
2032                          in_regs);
2033     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2034     __ flush();
2035     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2036     return nmethod::new_native_nmethod(method,
2037                                        compile_id,
2038                                        masm->code(),
2039                                        vep_offset,
2040                                        frame_complete,
2041                                        stack_slots / VMRegImpl::slots_per_word,
2042                                        in_ByteSize(-1),
2043                                        in_ByteSize(-1),
2044                                        nullptr);
2045   }
2046   address native_func = method->native_function();
2047   assert(native_func != nullptr, "must have function");
2048 
2049   // An OopMap for lock (and class if static)
2050   OopMapSet *oop_maps = new OopMapSet();
2051   intptr_t start = (intptr_t)__ pc();
2052 
  // We have received a description of where all the Java args are located
  // on entry to the wrapper. We need to convert these args to where
  // the JNI function will expect them. To figure out where they go
  // we convert the Java signature to a C signature by inserting
  // the hidden arguments as arg[0] and possibly arg[1] (static method).
2058 
2059   const int total_in_args = method->size_of_parameters();
2060   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2061 
2062   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2063   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2064   BasicType* in_elem_bt = nullptr;
2065 
2066   int argc = 0;
2067   out_sig_bt[argc++] = T_ADDRESS;
2068   if (method->is_static()) {
2069     out_sig_bt[argc++] = T_OBJECT;
2070   }
2071 
2072   for (int i = 0; i < total_in_args ; i++ ) {
2073     out_sig_bt[argc++] = in_sig_bt[i];
2074   }
2075 
2076   // Now figure out where the args must be stored and how much stack space
2077   // they require.
2078   int out_arg_slots;
2079   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
2080 
2081   // Compute framesize for the wrapper.  We need to handlize all oops in
2082   // incoming registers
2083 
2084   // Calculate the total number of stack slots we will need.
2085 
2086   // First count the abi requirement plus all of the outgoing args
2087   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2088 
2089   // Now the space for the inbound oop handle area
2090   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
2091 
2092   int oop_handle_offset = stack_slots;
2093   stack_slots += total_save_slots;
2094 
2095   // Now any space we need for handlizing a klass if static method
2096 
2097   int klass_slot_offset = 0;
2098   int klass_offset = -1;
2099   int lock_slot_offset = 0;
2100   bool is_static = false;
2101 
2102   if (method->is_static()) {
2103     klass_slot_offset = stack_slots;
2104     stack_slots += VMRegImpl::slots_per_word;
2105     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2106     is_static = true;
2107   }
2108 
2109   // Plus a lock if needed
2110 
2111   if (method->is_synchronized()) {
2112     lock_slot_offset = stack_slots;
2113     stack_slots += VMRegImpl::slots_per_word;
2114   }
2115 
2116   // Now a place (+2) to save return values or temp during shuffling
2117   // + 4 for return address (which we own) and saved rbp
2118   stack_slots += 6;
2119 
2120   // Ok The space we have allocated will look like:
2121   //
2122   //
2123   // FP-> |                     |
2124   //      |---------------------|
2125   //      | 2 slots for moves   |
2126   //      |---------------------|
2127   //      | lock box (if sync)  |
2128   //      |---------------------| <- lock_slot_offset
2129   //      | klass (if static)   |
2130   //      |---------------------| <- klass_slot_offset
2131   //      | oopHandle area      |
2132   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2133   //      | outbound memory     |
2134   //      | based arguments     |
2135   //      |                     |
2136   //      |---------------------|
2137   //      |                     |
2138   // SP-> | out_preserved_slots |
2139   //
2140   //
2141 
2142 
  // Now compute the actual number of stack words we need, rounding to make
  // the stack properly aligned.
2145   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2146 
2147   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2148 
2149   // First thing make an ic check to see if we should even be here
2150 
2151   // We are free to use all registers as temps without saving them and
2152   // restoring them except rbp. rbp is the only callee save register
2153   // as far as the interpreter and the compiler(s) are concerned.
2154 
2155 
2156   const Register ic_reg = rax;
2157   const Register receiver = j_rarg0;
2158 
2159   Label hit;
2160   Label exception_pending;
2161 
2162   assert_different_registers(ic_reg, receiver, rscratch1, rscratch2);
2163   __ verify_oop(receiver);
2164   __ load_klass(rscratch1, receiver, rscratch2);
2165   __ cmpq(ic_reg, rscratch1);
2166   __ jcc(Assembler::equal, hit);
2167 
2168   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
2169 
2170   // Verified entry point must be aligned
2171   __ align(8);
2172 
2173   __ bind(hit);
2174 
2175   int vep_offset = ((intptr_t)__ pc()) - start;
2176 
2177   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2178     Label L_skip_barrier;
2179     Register klass = r10;
2180     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2181     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2182 
2183     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2184 
2185     __ bind(L_skip_barrier);
2186   }
2187 
2188 #ifdef COMPILER1
2189   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2190   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2191     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2192   }
2193 #endif // COMPILER1
2194 
2195   // The instruction at the verified entry point must be 5 bytes or longer
2196   // because it can be patched on the fly by make_non_entrant. The stack bang
2197   // instruction fits that requirement.
2198 
2199   // Generate stack overflow check
2200   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2201 
2202   // Generate a new frame for the wrapper.
2203   __ enter();
2204   // -2 because return address is already present and so is saved rbp
2205   __ subptr(rsp, stack_size - 2*wordSize);
2206 
2207   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2208   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2209   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2210 
2211   // Frame is now completed as far as size and linkage.
2212   int frame_complete = ((intptr_t)__ pc()) - start;
2213 
  if (UseRTMLocking) {
    // Abort RTM transaction before calling JNI
    // because critical section will be large and will be
    // aborted anyway. Also nmethod could be deoptimized.
    __ xabort(0);
  }
2220 
2221 #ifdef ASSERT
2222   __ check_stack_alignment(rsp, "improperly aligned stack");
2223 #endif /* ASSERT */
2224 
2225 
2226   // We use r14 as the oop handle for the receiver/klass
2227   // It is callee save so it survives the call to native
2228 
2229   const Register oop_handle_reg = r14;
2230 
2231   //
  // We immediately shuffle the arguments so that for any VM call we have to
  // make from here on out (sync slow path, jvmti, etc.) we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.
2236 
2237   // -----------------
2238   // The Grand Shuffle
2239 
  // The Java calling convention is either equal (Linux) or denser (Win64) than the
  // C calling convention. However, because of the JNIEnv argument, the C calling
  // convention always has at least one more argument (two for static methods) than Java.
  // Therefore, if we move the args from Java -> C backwards, we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.
2246   //
2247 
2248   // Record esp-based slot for receiver on stack for non-static methods
2249   int receiver_offset = -1;
2250 
  // This is a trick. We double the stack slots so we can claim
  // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
  // sure we can capture all the incoming oop args from the
  // caller.
2256   //
2257   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2258 
2259   // Mark location of rbp (someday)
2260   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2261 
2262   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2263   // All inbound args are referenced based on rbp and all outbound args via rsp.
2264 
2265 
2266 #ifdef ASSERT
2267   bool reg_destroyed[Register::number_of_registers];
2268   bool freg_destroyed[XMMRegister::number_of_registers];
2269   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2270     reg_destroyed[r] = false;
2271   }
2272   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2273     freg_destroyed[f] = false;
2274   }
2275 
2276 #endif /* ASSERT */
2277 
2278   // For JNI natives the incoming and outgoing registers are offset upwards.
2279   GrowableArray<int> arg_order(2 * total_in_args);
2280 
2281   VMRegPair tmp_vmreg;
2282   tmp_vmreg.set2(rbx->as_VMReg());
2283 
2284   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2285     arg_order.push(i);
2286     arg_order.push(c_arg);
2287   }
2288 
2289   int temploc = -1;
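  // Shuffle each Java argument into its C position. The pairs were built last to
  // first above, so no source is overwritten before it has been moved.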
2290   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2291     int i = arg_order.at(ai);
2292     int c_arg = arg_order.at(ai + 1);
2293     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2294 #ifdef ASSERT
2295     if (in_regs[i].first()->is_Register()) {
2296       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2297     } else if (in_regs[i].first()->is_XMMRegister()) {
2298       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2299     }
2300     if (out_regs[c_arg].first()->is_Register()) {
2301       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2302     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2303       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2304     }
2305 #endif /* ASSERT */
2306     switch (in_sig_bt[i]) {
2307       case T_ARRAY:
2308       case T_OBJECT:
2309         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2310                     ((i == 0) && (!is_static)),
2311                     &receiver_offset);
2312         break;
2313       case T_VOID:
2314         break;
2315 
2316       case T_FLOAT:
2317         __ float_move(in_regs[i], out_regs[c_arg]);
2318           break;
2319 
2320       case T_DOUBLE:
2321         assert( i + 1 < total_in_args &&
2322                 in_sig_bt[i + 1] == T_VOID &&
2323                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2324         __ double_move(in_regs[i], out_regs[c_arg]);
2325         break;
2326 
2327       case T_LONG :
2328         __ long_move(in_regs[i], out_regs[c_arg]);
2329         break;
2330 
2331       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2332 
2333       default:
2334         __ move32_64(in_regs[i], out_regs[c_arg]);
2335     }
2336   }
2337 
2338   int c_arg;
2339 
2340   // Pre-load a static method's oop into r14.  Used both by locking code and
2341   // the normal JNI call code.
2342   // point c_arg at the first arg that is already loaded in case we
2343   // need to spill before we call out
2344   c_arg = total_c_args - total_in_args;
2345 
2346   if (method->is_static()) {
2347 
2348     //  load oop into a register
2349     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2350 
    // Now handlize the static class mirror; it's known to be not-null.
2352     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2353     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2354 
2355     // Now get the handle
2356     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2357     // store the klass handle as second argument
2358     __ movptr(c_rarg1, oop_handle_reg);
2359     // and protect the arg if we must spill
2360     c_arg--;
2361   }
2362 
2363   // Change state to native (we save the return address in the thread, since it might not
2364   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2365   // points into the right code segment. It does not have to be the correct return pc.
2366   // We use the same pc/oopMap repeatedly when we call out
2367 
2368   intptr_t the_pc = (intptr_t) __ pc();
2369   oop_maps->add_gc_map(the_pc - start, map);
2370 
2371   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2372 
2373 
  // We have all of the arguments set up at this point. We must not touch any of the
  // argument registers at this point (if we were to save/restore them, there would be
  // no oopMap describing any oops they contain).
2376 
2377   {
2378     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2379     // protect the args we've loaded
2380     save_args(masm, total_c_args, c_arg, out_regs);
2381     __ mov_metadata(c_rarg1, method());
2382     __ call_VM_leaf(
2383       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2384       r15_thread, c_rarg1);
2385     restore_args(masm, total_c_args, c_arg, out_regs);
2386   }
2387 
2388   // RedefineClasses() tracing support for obsolete method entry
2389   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2390     // protect the args we've loaded
2391     save_args(masm, total_c_args, c_arg, out_regs);
2392     __ mov_metadata(c_rarg1, method());
2393     __ call_VM_leaf(
2394       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2395       r15_thread, c_rarg1);
2396     restore_args(masm, total_c_args, c_arg, out_regs);
2397   }
2398 
2399   // Lock a synchronized method
2400 
2401   // Register definitions used by locking and unlocking
2402 
2403   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2404   const Register obj_reg  = rbx;  // Will contain the oop
2405   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2406   const Register old_hdr  = r13;  // value of old header at unlock time
2407 
2408   Label slow_path_lock;
2409   Label lock_done;
2410 
2411   if (method->is_synchronized()) {
2412     Label count_mon;
2413 
2414     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2415 
2416     // Get the handle (the 2nd argument)
2417     __ mov(oop_handle_reg, c_rarg1);
2418 
2419     // Get address of the box
2420 
2421     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2422 
2423     // Load the oop from the handle
2424     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2425 
2426     if (LockingMode == LM_MONITOR) {
2427       __ jmp(slow_path_lock);
2428     } else if (LockingMode == LM_LEGACY) {
2429       // Load immediate 1 into swap_reg %rax
2430       __ movl(swap_reg, 1);
2431 
2432       // Load (object->mark() | 1) into swap_reg %rax
2433       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2434       if (EnableValhalla) {
2435         // Mask inline_type bit such that we go to the slow path if object is an inline type
2436         __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place));
2437       }
2438 
2439       // Save (object->mark() | 1) into BasicLock's displaced header
2440       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2441 
2442       // src -> dest iff dest == rax else rax <- dest
2443       __ lock();
2444       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2445       __ jcc(Assembler::equal, count_mon);
2446 
2447       // Hmm should this move to the slow path code area???
2448 
2449       // Test if the oopMark is an obvious stack pointer, i.e.,
2450       //  1) (mark & 3) == 0, and
2451       //  2) rsp <= mark < mark + os::pagesize()
2452       // These 3 tests can be done by evaluating the following
2453       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2454       // assuming both stack pointer and pagesize have their
2455       // least significant 2 bits clear.
2456       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2457 
2458       __ subptr(swap_reg, rsp);
2459       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2460 
2461       // Save the test result, for recursive case, the result is zero
2462       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2463       __ jcc(Assembler::notEqual, slow_path_lock);
2464     } else {
2465       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2466       // Load object header
2467       __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2468       __ lightweight_lock(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2469     }
2470     __ bind(count_mon);
2471     __ inc_held_monitor_count();
2472 
2473     // Slow path will re-enter here
2474     __ bind(lock_done);
2475   }
2476 
2477   // Finally just about ready to make the JNI call
2478 
2479   // get JNIEnv* which is first argument to native
2480   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2481 
2482   // Now set thread in native
2483   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2484 
2485   __ call(RuntimeAddress(native_func));
2486 
2487   // Verify or restore cpu control state after JNI call
2488   __ restore_cpu_control_state_after_jni(rscratch1);
2489 
2490   // Unpack native results.
2491   switch (ret_type) {
2492   case T_BOOLEAN: __ c2bool(rax);            break;
2493   case T_CHAR   : __ movzwl(rax, rax);      break;
2494   case T_BYTE   : __ sign_extend_byte (rax); break;
2495   case T_SHORT  : __ sign_extend_short(rax); break;
2496   case T_INT    : /* nothing to do */        break;
2497   case T_DOUBLE :
2498   case T_FLOAT  :
    // Result is in xmm0; we'll save it as needed
2500     break;
2501   case T_ARRAY:                 // Really a handle
2502   case T_OBJECT:                // Really a handle
2503       break; // can't de-handlize until after safepoint check
2504   case T_VOID: break;
2505   case T_LONG: break;
2506   default       : ShouldNotReachHere();
2507   }
2508 
2509   Label after_transition;
2510 
2511   // Switch thread to "native transition" state before reading the synchronization state.
2512   // This additional state is necessary because reading and testing the synchronization
2513   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2514   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2515   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2516   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
2518   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2519 
2520   // Force this write out before the read below
2521   if (!UseSystemMemoryBarrier) {
2522     __ membar(Assembler::Membar_mask_bits(
2523               Assembler::LoadLoad | Assembler::LoadStore |
2524               Assembler::StoreLoad | Assembler::StoreStore));
2525   }
2526 
2527   // check for safepoint operation in progress and/or pending suspend requests
2528   {
2529     Label Continue;
2530     Label slow_path;
2531 
2532     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2533 
2534     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2535     __ jcc(Assembler::equal, Continue);
2536     __ bind(slow_path);
2537 
2538     // Don't use call_VM as it will see a possible pending exception and forward it
2539     // and never return here preventing us from clearing _last_native_pc down below.
2540     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2541     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2542     // by hand.
2543     //
2544     __ vzeroupper();
2545     save_native_result(masm, ret_type, stack_slots);
2546     __ mov(c_rarg0, r15_thread);
2547     __ mov(r12, rsp); // remember sp
2548     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2549     __ andptr(rsp, -16); // align stack as required by ABI
2550     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2551     __ mov(rsp, r12); // restore sp
2552     __ reinit_heapbase();
2553     // Restore any method result value
2554     restore_native_result(masm, ret_type, stack_slots);
2555     __ bind(Continue);
2556   }
2557 
2558   // change thread state
2559   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2560   __ bind(after_transition);
2561 
2562   Label reguard;
2563   Label reguard_done;
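  // If the stack guard pages were disabled (yellow zone hit) while we were in
  // native, take the out-of-line path at 'reguard' to re-enable them.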
2564   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2565   __ jcc(Assembler::equal, reguard);
2566   __ bind(reguard_done);
2567 
2568   // native result if any is live
2569 
2570   // Unlock
2571   Label slow_path_unlock;
2572   Label unlock_done;
2573   if (method->is_synchronized()) {
2574 
2575     Label fast_done;
2576 
2577     // Get locked oop from the handle we passed to jni
2578     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2579 
2580     if (LockingMode == LM_LEGACY) {
2581       Label not_recur;
2582       // Simple recursive lock?
2583       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2584       __ jcc(Assembler::notEqual, not_recur);
2585       __ dec_held_monitor_count();
2586       __ jmpb(fast_done);
2587       __ bind(not_recur);
2588     }
2589 
2590     // Must save rax if it is live now because cmpxchg must use it
2591     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2592       save_native_result(masm, ret_type, stack_slots);
2593     }
2594 
2595     if (LockingMode == LM_MONITOR) {
2596       __ jmp(slow_path_unlock);
2597     } else if (LockingMode == LM_LEGACY) {
2598       // get address of the stack lock
2599       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2600       //  get old displaced header
2601       __ movptr(old_hdr, Address(rax, 0));
2602 
2603       // Atomic swap old header if oop still contains the stack lock
2604       __ lock();
2605       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2606       __ jcc(Assembler::notEqual, slow_path_unlock);
2607       __ dec_held_monitor_count();
2608     } else {
2609       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2610       __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2611       __ andptr(swap_reg, ~(int32_t)markWord::lock_mask_in_place);
2612       __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2613       __ dec_held_monitor_count();
2614     }
2615 
2616     // slow path re-enters here
2617     __ bind(unlock_done);
2618     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2619       restore_native_result(masm, ret_type, stack_slots);
2620     }
2621 
2622     __ bind(fast_done);
2623   }
2624   {
2625     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2626     save_native_result(masm, ret_type, stack_slots);
2627     __ mov_metadata(c_rarg1, method());
2628     __ call_VM_leaf(
2629          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2630          r15_thread, c_rarg1);
2631     restore_native_result(masm, ret_type, stack_slots);
2632   }
2633 
2634   __ reset_last_Java_frame(false);
2635 
2636   // Unbox an oop result, i.e. resolve the returned JNI handle (JNIHandles::resolve).
2637   if (is_reference_type(ret_type)) {
2638     __ resolve_jobject(rax /* value */,
2639                        r15_thread /* thread */,
2640                        rcx /* tmp */);
2641   }
2642 
2643   if (CheckJNICalls) {
2644     // clear_pending_jni_exception_check
2645     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2646   }
2647 
2648   // reset handle block
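       // Clearing the block's top offset logically frees all JNI local handles
       // created during the native call.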
2649   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2650   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2651 
2652   // pop our frame
2653 
2654   __ leave();
2655 
2656   // Any exception pending?
2657   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2658   __ jcc(Assembler::notEqual, exception_pending);
2659 
2660   // Return
2661 
2662   __ ret(0);
2663 
2664   // Unexpected paths are out of line and go here
2665 
2666   // Forward the pending exception
2667   __ bind(exception_pending);
2668 
2669   // by jumping to the shared exception forwarding stub
2670   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2671 
2672   // Slow path locking & unlocking
2673   if (method->is_synchronized()) {
2674 
2675     // BEGIN Slow path lock
2676     __ bind(slow_path_lock);
2677 
2678     // We have last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM.
2679     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2680 
2681     // protect the args we've loaded
2682     save_args(masm, total_c_args, c_arg, out_regs);
2683 
2684     __ mov(c_rarg0, obj_reg);
2685     __ mov(c_rarg1, lock_reg);
2686     __ mov(c_rarg2, r15_thread);
2687 
2688     // Not a leaf but we have last_Java_frame setup as we want
2689     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2690     restore_args(masm, total_c_args, c_arg, out_regs);
2691 
2692 #ifdef ASSERT
2693     { Label L;
2694     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2695     __ jcc(Assembler::equal, L);
2696     __ stop("no pending exception allowed on exit from monitorenter");
2697     __ bind(L);
2698     }
2699 #endif
2700     __ jmp(lock_done);
2701 
2702     // END Slow path lock
2703 
2704     // BEGIN Slow path unlock
2705     __ bind(slow_path_unlock);
2706 
2707     // If we haven't already saved the native result we must save it now as xmm registers
2708     // are still exposed.
2709     __ vzeroupper();
2710     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2711       save_native_result(masm, ret_type, stack_slots);
2712     }
2713 
2714     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2715 
2716     __ mov(c_rarg0, obj_reg);
2717     __ mov(c_rarg2, r15_thread);
2718     __ mov(r12, rsp); // remember sp
2719     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2720     __ andptr(rsp, -16); // align stack as required by ABI
2721 
2722     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2723     // NOTE that obj_reg == rbx currently
2724     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2725     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2726 
2727     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2728     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2729     __ mov(rsp, r12); // restore sp
2730     __ reinit_heapbase();
2731 #ifdef ASSERT
2732     {
2733       Label L;
2734       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2735       __ jcc(Assembler::equal, L);
2736       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2737       __ bind(L);
2738     }
2739 #endif /* ASSERT */
2740 
2741     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2742 
2743     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2744       restore_native_result(masm, ret_type, stack_slots);
2745     }
2746     __ jmp(unlock_done);
2747 
2748     // END Slow path unlock
2749 
2750   } // synchronized
2751 
2752   // SLOW PATH Reguard the stack if needed
2753 
2754   __ bind(reguard);
2755   __ vzeroupper();
2756   save_native_result(masm, ret_type, stack_slots);
2757   __ mov(r12, rsp); // remember sp
2758   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2759   __ andptr(rsp, -16); // align stack as required by ABI
2760   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2761   __ mov(rsp, r12); // restore sp
2762   __ reinit_heapbase();
2763   restore_native_result(masm, ret_type, stack_slots);
2764   // and continue
2765   __ jmp(reguard_done);
2766 
2767 
2768 
2769   __ flush();
2770 
2771   nmethod *nm = nmethod::new_native_nmethod(method,
2772                                             compile_id,
2773                                             masm->code(),
2774                                             vep_offset,
2775                                             frame_complete,
2776                                             stack_slots / VMRegImpl::slots_per_word,
2777                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2778                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2779                                             oop_maps);
2780 
2781   return nm;
2782 }
2783 
2784 // This function returns the adjustment (in number of words) to a c2i adapter
2785 // activation, for use during deoptimization.
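     // For example, an interpreted callee declaring 3 parameters and 7 locals needs
     // (7 - 3) * Interpreter::stackElementWords extra words in the caller's frame.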
2786 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2787   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2788 }
2789 
2790 
2791 uint SharedRuntime::out_preserve_stack_slots() {
2792   return 0;
2793 }
2794 
2795 
2796 // Number of stack slots between incoming argument block and the start of
2797 // a new frame.  The PROLOG must add this many slots to the stack.  The
2798 // EPILOG must remove this many slots.  amd64 needs two slots for
2799 // return address.
2800 uint SharedRuntime::in_preserve_stack_slots() {
2801   return 4 + 2 * VerifyStackAtCalls;
2802 }
2803 
2804 //------------------------------generate_deopt_blob----------------------------
2805 void SharedRuntime::generate_deopt_blob() {
2806   // Allocate space for the code
2807   ResourceMark rm;
2808   // Setup code generation tools
2809   int pad = 0;
2810   if (UseAVX > 2) {
2811     pad += 1024;
2812   }
2813 #if INCLUDE_JVMCI
2814   if (EnableJVMCI) {
2815     pad += 512; // Increase the buffer size when compiling for JVMCI
2816   }
2817 #endif
2818   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2819   MacroAssembler* masm = new MacroAssembler(&buffer);
2820   int frame_size_in_words;
2821   OopMap* map = nullptr;
2822   OopMapSet *oop_maps = new OopMapSet();
2823 
2824   // -------------
2825   // This code enters when returning to a de-optimized nmethod.  A return
2826   // address has been pushed on the stack, and return values are in
2827   // registers.
2828   // If we are doing a normal deopt then we were called from the patched
2829   // nmethod at the point where we returned into it. So the return
2830   // address on the stack is wrong by NativeCall::instruction_size.
2831   // We will adjust the value so it looks like we have the original return
2832   // address on the stack (like when we eagerly deoptimized).
2833   // In the case of an exception pending when deoptimizing, we enter
2834   // with a return address on the stack that points after the call we patched
2835   // into the exception handler. We have the following register state from,
2836   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2837   //    rax: exception oop
2838   //    rbx: exception handler
2839   //    rdx: throwing pc
2840   // So in this case we simply jam rdx into the useless return address and
2841   // the stack looks just like we want.
2842   //
2843   // At this point we need to de-opt.  We save the argument return
2844   // registers.  We call the first C routine, fetch_unroll_info().  This
2845   // routine captures the return values and returns a structure which
2846   // describes the current frame size and the sizes of all replacement frames.
2847   // The current frame is compiled code and may contain many inlined
2848   // functions, each with their own JVM state.  We pop the current frame, then
2849   // push all the new frames.  Then we call the C routine unpack_frames() to
2850   // populate these frames.  Finally unpack_frames() returns us the new target
2851   // address.  Notice that callee-save registers are BLOWN here; they have
2852   // already been captured in the vframeArray at the time the return PC was
2853   // patched.
2854   address start = __ pc();
2855   Label cont;
2856 
2857   // Prolog for non exception case!
2858 
2859   // Save everything in sight.
2860   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2861 
2862   // Normal deoptimization.  Save exec mode for unpack_frames.
2863   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2864   __ jmp(cont);
2865 
2866   int reexecute_offset = __ pc() - start;
2867 #if INCLUDE_JVMCI && !defined(COMPILER1)
2868   if (EnableJVMCI && UseJVMCICompiler) {
2869     // JVMCI does not use this kind of deoptimization
2870     __ should_not_reach_here();
2871   }
2872 #endif
2873 
2874   // Reexecute case
2875   // The return address is the pc that describes which bci to re-execute at.
2876 
2877   // No need to update map as each call to save_live_registers will produce identical oopmap
2878   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2879 
2880   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2881   __ jmp(cont);
2882 
2883 #if INCLUDE_JVMCI
2884   Label after_fetch_unroll_info_call;
2885   int implicit_exception_uncommon_trap_offset = 0;
2886   int uncommon_trap_offset = 0;
2887 
2888   if (EnableJVMCI) {
2889     implicit_exception_uncommon_trap_offset = __ pc() - start;
2890 
2891     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2892     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2893 
2894     uncommon_trap_offset = __ pc() - start;
2895 
2896     // Save everything in sight.
2897     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2898     // fetch_unroll_info needs to call last_java_frame()
2899     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2900 
2901     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2902     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2903 
2904     __ movl(r14, Deoptimization::Unpack_reexecute);
2905     __ mov(c_rarg0, r15_thread);
2906     __ movl(c_rarg2, r14); // exec mode
2907     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2908     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2909 
2910     __ reset_last_Java_frame(false);
2911 
2912     __ jmp(after_fetch_unroll_info_call);
2913   } // EnableJVMCI
2914 #endif // INCLUDE_JVMCI
2915 
2916   int exception_offset = __ pc() - start;
2917 
2918   // Prolog for exception case
2919 
2920   // All registers are dead at this entry point, except for rax and
2921   // rdx, which contain the exception oop and exception pc
2922   // respectively.  Set them in TLS and fall thru to the
2923   // unpack_with_exception_in_tls entry point.
2924 
2925   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2926   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2927 
2928   int exception_in_tls_offset = __ pc() - start;
2929 
2930   // New implementation: the exception oop is now passed in the JavaThread.
2931 
2932   // Prolog for exception case
2933   // All registers must be preserved because they might be used by LinearScan
2934   // Exception oop and throwing PC are passed in JavaThread
2935   // tos: stack at point of call to method that threw the exception (i.e. only
2936   // args are on the stack, no return address)
2937 
2938   // make room on stack for the return address
2939   // It will be patched later with the throwing pc. The correct value is not
2940   // available now because loading it from memory would destroy registers.
2941   __ push(0);
2942 
2943   // Save everything in sight.
2944   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2945 
2946   // Now it is safe to overwrite any register
2947 
2948   // Deopt during an exception.  Save exec mode for unpack_frames.
2949   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2950 
2951   // load throwing pc from JavaThread and patch it as the return address
2952   // of the current frame. Then clear the field in JavaThread
2953 
2954   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2955   __ movptr(Address(rbp, wordSize), rdx);
2956   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2957 
2958 #ifdef ASSERT
2959   // verify that there is really an exception oop in JavaThread
2960   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2961   __ verify_oop(rax);
2962 
2963   // verify that there is no pending exception
2964   Label no_pending_exception;
2965   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2966   __ testptr(rax, rax);
2967   __ jcc(Assembler::zero, no_pending_exception);
2968   __ stop("must not have pending exception here");
2969   __ bind(no_pending_exception);
2970 #endif
2971 
2972   __ bind(cont);
2973 
2974   // Call C code.  Need thread and this frame, but NOT official VM entry
2975   // crud.  We cannot block on this call, no GC can happen.
2976   //
2977   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2978 
2979   // fetch_unroll_info needs to call last_java_frame().
2980 
2981   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2982 #ifdef ASSERT
2983   { Label L;
2984     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2985     __ jcc(Assembler::equal, L);
2986     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2987     __ bind(L);
2988   }
2989 #endif // ASSERT
2990   __ mov(c_rarg0, r15_thread);
2991   __ movl(c_rarg1, r14); // exec_mode
2992   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2993 
2994   // Need to have an oopmap that tells fetch_unroll_info where to
2995   // find any register it might need.
2996   oop_maps->add_gc_map(__ pc() - start, map);
2997 
2998   __ reset_last_Java_frame(false);
2999 
3000 #if INCLUDE_JVMCI
3001   if (EnableJVMCI) {
3002     __ bind(after_fetch_unroll_info_call);
3003   }
3004 #endif
3005 
3006   // Load UnrollBlock* into rdi
3007   __ mov(rdi, rax);
3008 
3009   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
3010   Label noException;
3011   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
3012   __ jcc(Assembler::notEqual, noException);
3013   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3014   // Note: exception_pc was already cleared above in the exception case, so this is likely redundant
3015   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3016   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3017   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3018 
3019   __ verify_oop(rax);
3020 
3021   // Overwrite the result registers with the exception results.
3022   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3023   // I think this is useless
3024   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
3025 
3026   __ bind(noException);
3027 
3028   // Only register save data is on the stack.
3029   // Now restore the result registers.  Everything else is either dead
3030   // or captured in the vframeArray.
3031   RegisterSaver::restore_result_registers(masm);
3032 
3033   // All of the register save area has been popped off the stack. Only the
3034   // return address remains.
3035 
3036   // Pop all the frames we must move/replace.
3037   //
3038   // Frame picture (youngest to oldest)
3039   // 1: self-frame (no frame link)
3040   // 2: deopting frame  (no frame link)
3041   // 3: caller of deopting frame (could be compiled/interpreted).
3042   //
3043   // Note: by leaving the return address of self-frame on the stack
3044   // and using the size of frame 2 to adjust the stack
3045   // when we are done the return to frame 3 will still be on the stack.
3046 
3047   // Pop deoptimized frame
3048   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
3049   __ addptr(rsp, rcx);
3050 
3051   // rsp should be pointing at the return address to the caller (3)
3052 
3053   // Pick up the initial fp we should save
3054   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3055   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3056 
3057 #ifdef ASSERT
3058   // Compilers generate code that bangs the stack by as much as the
3059   // interpreter would need. So this stack banging should never
3060   // trigger a fault. Verify that it does not on non-product builds.
3061   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3062   __ bang_stack_size(rbx, rcx);
3063 #endif
3064 
3065   // Load address of array of frame pcs into rcx
3066   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3067 
3068   // Trash the old pc
3069   __ addptr(rsp, wordSize);
3070 
3071   // Load address of array of frame sizes into rsi
3072   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3073 
3074   // Load counter into rdx
3075   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
3076 
3077   // Now adjust the caller's stack to make up for the extra locals
3078   // but record the original sp so that we can save it in the skeletal interpreter
3079   // frame and the stack walking of interpreter_sender will get the unextended sp
3080   // value and not the "real" sp value.
3081 
3082   const Register sender_sp = r8;
3083 
3084   __ mov(sender_sp, rsp);
3085   __ movl(rbx, Address(rdi,
3086                        Deoptimization::UnrollBlock::
3087                        caller_adjustment_offset()));
3088   __ subptr(rsp, rbx);
3089 
3090   // Push interpreter frames in a loop
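       // Each skeletal frame gets its return pc from the frame_pcs array, its saved
       // rbp from enter(), and a body of frame_sizes[i] - 2 words; last_sp is cleared
       // and sender_sp recorded so the frame is walkable before unpack_frames()
       // fills it in.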
3091   Label loop;
3092   __ bind(loop);
3093   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3094   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3095   __ pushptr(Address(rcx, 0));          // Save return address
3096   __ enter();                           // Save old & set new ebp
3097   __ subptr(rsp, rbx);                  // Prolog
3098   // This value is corrected by layout_activation_impl
3099   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3100   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3101   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3102   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3103   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3104   __ decrementl(rdx);                   // Decrement counter
3105   __ jcc(Assembler::notZero, loop);
3106   __ pushptr(Address(rcx, 0));          // Save final return address
3107 
3108   // Re-push self-frame
3109   __ enter();                           // Save old & set new ebp
3110 
3111   // Allocate a full sized register save area.
3112   // Return address and rbp are in place, so we allocate two fewer words.
3113   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3114 
3115   // Restore frame locals after moving the frame
3116   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3117   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3118 
3119   // Call C code.  Need thread but NOT official VM entry
3120   // crud.  We cannot block on this call, no GC can happen.  Call should
3121   // restore return values to their stack-slots with the new SP.
3122   //
3123   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3124 
3125   // Use rbp because the frames look interpreted now
3126   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3127   // Don't need the precise return PC here, just precise enough to point into this code blob.
3128   address the_pc = __ pc();
3129   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3130 
3131   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3132   __ mov(c_rarg0, r15_thread);
3133   __ movl(c_rarg1, r14); // second arg: exec_mode
3134   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3135   // Revert SP alignment after call since we're going to do some SP relative addressing below
3136   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3137 
3138   // Set an oopmap for the call site
3139   // Use the same PC we used for the last java frame
3140   oop_maps->add_gc_map(the_pc - start,
3141                        new OopMap( frame_size_in_words, 0 ));
3142 
3143   // Clear fp AND pc
3144   __ reset_last_Java_frame(true);
3145 
3146   // Collect return values
3147   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3148   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3149   // I think this is useless (throwing pc?)
3150   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3151 
3152   // Pop self-frame.
3153   __ leave();                           // Epilog
3154 
3155   // Jump to interpreter
3156   __ ret(0);
3157 
3158   // Make sure all code is generated
3159   masm->flush();
3160 
3161   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3162   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3163 #if INCLUDE_JVMCI
3164   if (EnableJVMCI) {
3165     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3166     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3167   }
3168 #endif
3169 }
3170 
3171 #ifdef COMPILER2
3172 //------------------------------generate_uncommon_trap_blob--------------------
3173 void SharedRuntime::generate_uncommon_trap_blob() {
3174   // Allocate space for the code
3175   ResourceMark rm;
3176   // Setup code generation tools
3177   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
3178   MacroAssembler* masm = new MacroAssembler(&buffer);
3179 
3180   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3181 
3182   address start = __ pc();
3183 
3184   if (UseRTMLocking) {
3185     // Abort RTM transaction before possible nmethod deoptimization.
3186     __ xabort(0);
3187   }
3188 
3189   // Push self-frame.  We get here with a return address on the
3190   // stack, so rsp is 8-byte aligned until we allocate our frame.
3191   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
3192 
3193   // No callee saved registers. rbp is assumed implicitly saved
3194   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3195 
3196   // The compiler left unloaded_class_index in j_rarg0; move it to where the
3197   // runtime expects it.
3198   __ movl(c_rarg1, j_rarg0);
3199 
3200   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3201 
3202   // Call C code.  Need thread but NOT official VM entry
3203   // crud.  We cannot block on this call, no GC can happen.  Call should
3204   // capture callee-saved registers as well as return values.
3205   // Thread is in rdi already.
3206   //
3207   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
3208 
3209   __ mov(c_rarg0, r15_thread);
3210   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
3211   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3212 
3213   // Set an oopmap for the call site
3214   OopMapSet* oop_maps = new OopMapSet();
3215   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
3216 
3217   // location of rbp is known implicitly by the frame sender code
3218 
3219   oop_maps->add_gc_map(__ pc() - start, map);
3220 
3221   __ reset_last_Java_frame(false);
3222 
3223   // Load UnrollBlock* into rdi
3224   __ mov(rdi, rax);
3225 
3226 #ifdef ASSERT
3227   { Label L;
3228     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
3229               Deoptimization::Unpack_uncommon_trap);
3230     __ jcc(Assembler::equal, L);
3231     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
3232     __ bind(L);
3233   }
3234 #endif
3235 
3236   // Pop all the frames we must move/replace.
3237   //
3238   // Frame picture (youngest to oldest)
3239   // 1: self-frame (no frame link)
3240   // 2: deopting frame  (no frame link)
3241   // 3: caller of deopting frame (could be compiled/interpreted).
3242 
3243   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3244   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3245 
3246   // Pop deoptimized frame (int)
3247   __ movl(rcx, Address(rdi,
3248                        Deoptimization::UnrollBlock::
3249                        size_of_deoptimized_frame_offset()));
3250   __ addptr(rsp, rcx);
3251 
3252   // rsp should be pointing at the return address to the caller (3)
3253 
3254   // Pick up the initial fp we should save
3255   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3256   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3257 
3258 #ifdef ASSERT
3259   // Compilers generate code that bangs the stack by as much as the
3260   // interpreter would need. So this stack banging should never
3261   // trigger a fault. Verify that it does not on non-product builds.
3262   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3263   __ bang_stack_size(rbx, rcx);
3264 #endif
3265 
3266   // Load address of array of frame pcs into rcx (address*)
3267   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3268 
3269   // Trash the return pc
3270   __ addptr(rsp, wordSize);
3271 
3272   // Load address of array of frame sizes into rsi (intptr_t*)
3273   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3274 
3275   // Counter
3276   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
3277 
3278   // Now adjust the caller's stack to make up for the extra locals but
3279   // record the original sp so that we can save it in the skeletal
3280   // interpreter frame and the stack walking of interpreter_sender
3281   // will get the unextended sp value and not the "real" sp value.
3282 
3283   const Register sender_sp = r8;
3284 
3285   __ mov(sender_sp, rsp);
3286   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
3287   __ subptr(rsp, rbx);
3288 
3289   // Push interpreter frames in a loop
3290   Label loop;
3291   __ bind(loop);
3292   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3293   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3294   __ pushptr(Address(rcx, 0));     // Save return address
3295   __ enter();                      // Save old & set new rbp
3296   __ subptr(rsp, rbx);             // Prolog
3297   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3298             sender_sp);            // Make it walkable
3299   // This value is corrected by layout_activation_impl
3300   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3301   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3302   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3303   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3304   __ decrementl(rdx);              // Decrement counter
3305   __ jcc(Assembler::notZero, loop);
3306   __ pushptr(Address(rcx, 0));     // Save final return address
3307 
3308   // Re-push self-frame
3309   __ enter();                 // Save old & set new rbp
3310   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3311                               // Prolog
3312 
3313   // Use rbp because the frames look interpreted now
3314   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3315   // Don't need the precise return PC here, just precise enough to point into this code blob.
3316   address the_pc = __ pc();
3317   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3318 
3319   // Call C code.  Need thread but NOT official VM entry
3320   // crud.  We cannot block on this call, no GC can happen.  Call should
3321   // restore return values to their stack-slots with the new SP.
3322   // Thread is in rdi already.
3323   //
3324   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3325 
3326   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3327   __ mov(c_rarg0, r15_thread);
3328   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3329   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3330 
3331   // Set an oopmap for the call site
3332   // Use the same PC we used for the last java frame
3333   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3334 
3335   // Clear fp AND pc
3336   __ reset_last_Java_frame(true);
3337 
3338   // Pop self-frame.
3339   __ leave();                 // Epilog
3340 
3341   // Jump to interpreter
3342   __ ret(0);
3343 
3344   // Make sure all code is generated
3345   masm->flush();
3346 
3347   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3348                                                  SimpleRuntimeFrame::framesize >> 1);
3349 }
3350 #endif // COMPILER2
3351 
3352 //------------------------------generate_handler_blob------
3353 //
3354 // Generate a special Compile2Runtime blob that saves all registers
3355 // and sets up the oopmap.
3356 //
3357 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3358   assert(StubRoutines::forward_exception_entry() != nullptr,
3359          "must be generated before");
3360 
3361   ResourceMark rm;
3362   OopMapSet *oop_maps = new OopMapSet();
3363   OopMap* map;
3364 
3365   // Allocate space for the code.  Setup code generation tools.
3366   CodeBuffer buffer("handler_blob", 2048, 1024);
3367   MacroAssembler* masm = new MacroAssembler(&buffer);
3368 
3369   address start   = __ pc();
3370   address call_pc = nullptr;
3371   int frame_size_in_words;
3372   bool cause_return = (poll_type == POLL_AT_RETURN);
3373   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3374 
3375   if (UseRTMLocking) {
3376     // Abort RTM transaction before calling runtime
3377     // because critical section will be large and will be
3378     // aborted anyway. Also nmethod could be deoptimized.
3379     __ xabort(0);
3380   }
3381 
3382   // Make room for return address (or push it again)
3383   if (!cause_return) {
3384     __ push(rbx);
3385   }
3386 
3387   // Save registers, fpu state, and flags
3388   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3389 
3390   // The following is basically a call_VM.  However, we need the precise
3391   // address of the call in order to generate an oopmap. Hence, we do all the
3392   // work ourselves.
3393 
3394   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3395 
3396   // The return address must always be correct so that frame constructor never
3397   // sees an invalid pc.
3398 
3399   if (!cause_return) {
3400     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3401     // Additionally, rbx is a callee saved register and we can look at it later to determine
3402     // if someone changed the return address for us!
3403     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3404     __ movptr(Address(rbp, wordSize), rbx);
3405   }
3406 
3407   // Do the call
3408   __ mov(c_rarg0, r15_thread);
3409   __ call(RuntimeAddress(call_ptr));
3410 
3411   // Set an oopmap for the call site.  This oopmap will map all
3412   // oop-registers and debug-info registers as callee-saved.  This
3413   // will allow deoptimization at this safepoint to find all possible
3414   // debug-info recordings, as well as let GC find all oops.
3415 
3416   oop_maps->add_gc_map( __ pc() - start, map);
3417 
3418   Label noException;
3419 
3420   __ reset_last_Java_frame(false);
3421 
3422   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3423   __ jcc(Assembler::equal, noException);
3424 
3425   // Exception pending
3426 
3427   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3428 
3429   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3430 
3431   // No exception case
3432   __ bind(noException);
3433 
3434   Label no_adjust;
3435 #ifdef ASSERT
3436   Label bail;
3437 #endif
3438   if (!cause_return) {
3439     Label no_prefix, not_special;
3440 
3441     // If our stashed return pc was modified by the runtime we avoid touching it
3442     __ cmpptr(rbx, Address(rbp, wordSize));
3443     __ jccb(Assembler::notEqual, no_adjust);
3444 
3445     // Skip over the poll instruction.
3446     // See NativeInstruction::is_safepoint_poll()
3447     // Possible encodings:
3448     //      85 00       test   %eax,(%rax)
3449     //      85 01       test   %eax,(%rcx)
3450     //      85 02       test   %eax,(%rdx)
3451     //      85 03       test   %eax,(%rbx)
3452     //      85 06       test   %eax,(%rsi)
3453     //      85 07       test   %eax,(%rdi)
3454     //
3455     //   41 85 00       test   %eax,(%r8)
3456     //   41 85 01       test   %eax,(%r9)
3457     //   41 85 02       test   %eax,(%r10)
3458     //   41 85 03       test   %eax,(%r11)
3459     //   41 85 06       test   %eax,(%r14)
3460     //   41 85 07       test   %eax,(%r15)
3461     //
3462     //      85 04 24    test   %eax,(%rsp)
3463     //   41 85 04 24    test   %eax,(%r12)
3464     //      85 45 00    test   %eax,0x0(%rbp)
3465     //   41 85 45 00    test   %eax,0x0(%r13)
3466 
3467     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3468     __ jcc(Assembler::notEqual, no_prefix);
3469     __ addptr(rbx, 1);
3470     __ bind(no_prefix);
3471 #ifdef ASSERT
3472     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3473 #endif
3474     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3475     // r12/rsp 0x04
3476     // r13/rbp 0x05
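         // (rsp/r12 need a SIB byte and rbp/r13 a zero displacement byte, which is
         // why those encodings above are one byte longer.)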
3477     __ movzbq(rcx, Address(rbx, 1));
3478     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3479     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3480     __ cmpptr(rcx, 1);
3481     __ jcc(Assembler::above, not_special);
3482     __ addptr(rbx, 1);
3483     __ bind(not_special);
3484 #ifdef ASSERT
3485     // Verify the correct encoding of the poll we're about to skip.
3486     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3487     __ jcc(Assembler::notEqual, bail);
3488     // Mask out the modrm bits
3489     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3490     // rax encodes to 0, so if the bits are nonzero it's incorrect
3491     __ jcc(Assembler::notZero, bail);
3492 #endif
3493     // Adjust return pc forward to step over the safepoint poll instruction
3494     __ addptr(rbx, 2);
3495     __ movptr(Address(rbp, wordSize), rbx);
3496   }
3497 
3498   __ bind(no_adjust);
3499   // Normal exit, restore registers and exit.
3500   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3501   __ ret(0);
3502 
3503 #ifdef ASSERT
3504   __ bind(bail);
3505   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3506 #endif
3507 
3508   // Make sure all code is generated
3509   masm->flush();
3510 
3511   // Fill-out other meta info
3512   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3513 }
3514 
3515 //
3516 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3517 //
3518 // Generate a stub that calls into the VM to find out the proper destination
3519 // of a java call. All the argument registers are live at this point,
3520 // but since this is generic code we don't know what they are, and the caller
3521 // must do any GC of the args.
3522 //
3523 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3524   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3525 
3526   // allocate space for the code
3527   ResourceMark rm;
3528 
3529   CodeBuffer buffer(name, 1200, 512);
3530   MacroAssembler* masm = new MacroAssembler(&buffer);
3531 
3532   int frame_size_in_words;
3533 
3534   OopMapSet *oop_maps = new OopMapSet();
3535   OopMap* map = nullptr;
3536 
3537   int start = __ offset();
3538 
3539   // No need to save vector registers since they are caller-saved anyway.
3540   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3541 
3542   int frame_complete = __ offset();
3543 
3544   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3545 
3546   __ mov(c_rarg0, r15_thread);
3547 
3548   __ call(RuntimeAddress(destination));
3549 
3550 
3551   // Set an oopmap for the call site.
3552   // We need this not only for callee-saved registers, but also for volatile
3553   // registers that the compiler might be keeping live across a safepoint.
3554 
3555   oop_maps->add_gc_map( __ offset() - start, map);
3556 
3557   // rax contains the address we are going to jump to assuming no exception got installed
3558 
3559   // clear last_Java_sp
3560   __ reset_last_Java_frame(false);
3561   // check for pending exceptions
3562   Label pending;
3563   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3564   __ jcc(Assembler::notEqual, pending);
3565 
3566   // get the returned Method*
3567   __ get_vm_result_2(rbx, r15_thread);
3568   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3569 
3570   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3571 
3572   RegisterSaver::restore_live_registers(masm);
3573 
3574   // We are back to the original state on entry and ready to go.
3575 
3576   __ jmp(rax);
3577 
3578   // Pending exception after the safepoint
3579 
3580   __ bind(pending);
3581 
3582   RegisterSaver::restore_live_registers(masm);
3583 
3584   // exception pending => remove activation and forward to exception handler
3585 
3586   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3587 
3588   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3589   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3590 
3591   // -------------
3592   // make sure all code is generated
3593   masm->flush();
3594 
3595   // Return the blob. The frame size is passed in words, as for the other
3596   // blobs created in this file.
3597   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3598 }
3599 
3600 //------------------------------Montgomery multiplication------------------------
3601 //
3602 
3603 #ifndef _WINDOWS
3604 
3605 // Subtract 0:b from carry:a.  Return carry.
3606 static julong
3607 sub(julong a[], julong b[], julong carry, long len) {
3608   long long i = 0, cnt = len;
3609   julong tmp;
3610   asm volatile("clc; "
3611                "0: ; "
3612                "mov (%[b], %[i], 8), %[tmp]; "
3613                "sbb %[tmp], (%[a], %[i], 8); "
3614                "inc %[i]; dec %[cnt]; "
3615                "jne 0b; "
3616                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3617                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3618                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3619                : "memory");
3620   return tmp;
3621 }
3622 
3623 // Multiply (unsigned) Long A by Long B, accumulating the double-
3624 // length result into the accumulator formed of T0, T1, and T2.
3625 #define MACC(A, B, T0, T1, T2)                                  \
3626 do {                                                            \
3627   unsigned long hi, lo;                                         \
3628   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3629            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3630            : "r"(A), "a"(B) : "cc");                            \
3631  } while(0)
3632 
3633 // As above, but add twice the double-length result into the
3634 // accumulator.
3635 #define MACC2(A, B, T0, T1, T2)                                 \
3636 do {                                                            \
3637   unsigned long hi, lo;                                         \
3638   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3639            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3640            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3641            : "r"(A), "a"(B) : "cc");                            \
3642  } while(0)
3643 
3644 #else //_WINDOWS
3645 
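     // Subtract 0:b from carry:a.  Return carry.  MSVC has no x64 inline asm, so
     // this uses the identity a - b == a + ~b + 1: the loop starts with a carry of 1
     // and adds the complemented words of b via _addcarry_u64.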
3646 static julong
3647 sub(julong a[], julong b[], julong carry, long len) {
3648   long i;
3649   julong tmp;
3650   unsigned char c = 1;
3651   for (i = 0; i < len; i++) {
3652     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3653     a[i] = tmp;
3654   }
3655   c = _addcarry_u64(c, carry, ~0, &tmp);
3656   return tmp;
3657 }
3658 
3659 // Multiply (unsigned) Long A by Long B, accumulating the double-
3660 // length result into the accumulator formed of T0, T1, and T2.
3661 #define MACC(A, B, T0, T1, T2)                          \
3662 do {                                                    \
3663   julong hi, lo;                            \
3664   lo = _umul128(A, B, &hi);                             \
3665   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3666   c = _addcarry_u64(c, hi, T1, &T1);                    \
3667   _addcarry_u64(c, T2, 0, &T2);                         \
3668  } while(0)
3669 
3670 // As above, but add twice the double-length result into the
3671 // accumulator.
3672 #define MACC2(A, B, T0, T1, T2)                         \
3673 do {                                                    \
3674   julong hi, lo;                            \
3675   lo = _umul128(A, B, &hi);                             \
3676   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3677   c = _addcarry_u64(c, hi, T1, &T1);                    \
3678   _addcarry_u64(c, T2, 0, &T2);                         \
3679   c = _addcarry_u64(0, lo, T0, &T0);                    \
3680   c = _addcarry_u64(c, hi, T1, &T1);                    \
3681   _addcarry_u64(c, T2, 0, &T2);                         \
3682  } while(0)
3683 
3684 #endif //_WINDOWS
3685 
3686 // Fast Montgomery multiplication.  The derivation of the algorithm is
3687 // in  A Cryptographic Library for the Motorola DSP56000,
3688 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
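     //
     // Here inv is -n[0]^-1 mod 2^64 (the assert below checks that
     // inv * n[0] == ULLONG_MAX, i.e. -1 mod 2^64).  Choosing m[i] = t0 * inv
     // makes t0 + m[i] * n[0] == 0 mod 2^64, which is why the low accumulator
     // word can be asserted to be zero and shifted out on every iteration.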
3689 
3690 static void NOINLINE
3691 montgomery_multiply(julong a[], julong b[], julong n[],
3692                     julong m[], julong inv, int len) {
3693   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3694   int i;
3695 
3696   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3697 
3698   for (i = 0; i < len; i++) {
3699     int j;
3700     for (j = 0; j < i; j++) {
3701       MACC(a[j], b[i-j], t0, t1, t2);
3702       MACC(m[j], n[i-j], t0, t1, t2);
3703     }
3704     MACC(a[i], b[0], t0, t1, t2);
3705     m[i] = t0 * inv;
3706     MACC(m[i], n[0], t0, t1, t2);
3707 
3708     assert(t0 == 0, "broken Montgomery multiply");
3709 
3710     t0 = t1; t1 = t2; t2 = 0;
3711   }
3712 
3713   for (i = len; i < 2*len; i++) {
3714     int j;
3715     for (j = i-len+1; j < len; j++) {
3716       MACC(a[j], b[i-j], t0, t1, t2);
3717       MACC(m[j], n[i-j], t0, t1, t2);
3718     }
3719     m[i-len] = t0;
3720     t0 = t1; t1 = t2; t2 = 0;
3721   }
3722 
3723   while (t0)
3724     t0 = sub(m, n, t0, len);
3725 }
3726 
3727 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3728 // multiplies so it should be up to 25% faster than Montgomery
3729 // multiplication.  However, its loop control is more complex and it
3730 // may actually run slower on some machines.
3731 
3732 static void NOINLINE
3733 montgomery_square(julong a[], julong n[],
3734                   julong m[], julong inv, int len) {
3735   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3736   int i;
3737 
3738   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3739 
3740   for (i = 0; i < len; i++) {
3741     int j;
3742     int end = (i+1)/2;
3743     for (j = 0; j < end; j++) {
3744       MACC2(a[j], a[i-j], t0, t1, t2);
3745       MACC(m[j], n[i-j], t0, t1, t2);
3746     }
3747     if ((i & 1) == 0) {
3748       MACC(a[j], a[j], t0, t1, t2);
3749     }
3750     for (; j < i; j++) {
3751       MACC(m[j], n[i-j], t0, t1, t2);
3752     }
3753     m[i] = t0 * inv;
3754     MACC(m[i], n[0], t0, t1, t2);
3755 
3756     assert(t0 == 0, "broken Montgomery square");
3757 
3758     t0 = t1; t1 = t2; t2 = 0;
3759   }
3760 
3761   for (i = len; i < 2*len; i++) {
3762     int start = i-len+1;
3763     int end = start + (len - start)/2;
3764     int j;
3765     for (j = start; j < end; j++) {
3766       MACC2(a[j], a[i-j], t0, t1, t2);
3767       MACC(m[j], n[i-j], t0, t1, t2);
3768     }
3769     if ((i & 1) == 0) {
3770       MACC(a[j], a[j], t0, t1, t2);
3771     }
3772     for (; j < len; j++) {
3773       MACC(m[j], n[i-j], t0, t1, t2);
3774     }
3775     m[i-len] = t0;
3776     t0 = t1; t1 = t2; t2 = 0;
3777   }
3778 
3779   while (t0)
3780     t0 = sub(m, n, t0, len);
3781 }
3782 
3783 // Swap words in a longword.
3784 static julong swap(julong x) {
3785   return (x << 32) | (x >> 32);
3786 }
3787 
3788 // Copy len longwords from s to d, word-swapping as we go.  The
3789 // destination array is reversed.
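     // This converts between the big-endian int order of the Java-side arrays and
     // the little-endian julong order used by the arithmetic above (and back again
     // for the result).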
3790 static void reverse_words(julong *s, julong *d, int len) {
3791   d += len;
3792   while(len-- > 0) {
3793     d--;
3794     *d = swap(*s);
3795     s++;
3796   }
3797 }
3798 
3799 // The threshold at which squaring is advantageous was determined
3800 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3801 #define MONTGOMERY_SQUARING_THRESHOLD 64
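     // The threshold is measured in jints (the len argument below), so 64
     // corresponds to a 2048-bit operand.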
3802 
3803 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3804                                         jint len, jlong inv,
3805                                         jint *m_ints) {
3806   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3807   int longwords = len/2;
3808 
3809   // Make very sure we don't use so much space that the stack might
3810   // overflow.  512 jints correspond to a 16384-bit integer and
3811   // will use here a total of 8k bytes of stack space.
3812   int divisor = sizeof(julong) * 4;
3813   guarantee(longwords <= 8192 / divisor, "must be");
3814   int total_allocation = longwords * sizeof (julong) * 4;
3815   julong *scratch = (julong *)alloca(total_allocation);
3816 
3817   // Local scratch arrays
3818   julong
3819     *a = scratch + 0 * longwords,
3820     *b = scratch + 1 * longwords,
3821     *n = scratch + 2 * longwords,
3822     *m = scratch + 3 * longwords;
3823 
3824   reverse_words((julong *)a_ints, a, longwords);
3825   reverse_words((julong *)b_ints, b, longwords);
3826   reverse_words((julong *)n_ints, n, longwords);
3827 
3828   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3829 
3830   reverse_words(m, (julong *)m_ints, longwords);
3831 }
3832 
3833 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3834                                       jint len, jlong inv,
3835                                       jint *m_ints) {
3836   assert(len % 2 == 0, "array length in montgomery_square must be even");
3837   int longwords = len/2;
3838 
3839   // Make very sure we don't use so much space that the stack might
3840   // overflow.  512 jints correspond to a 16384-bit integer and
3841   // will use here a total of 6k bytes of stack space.
3842   int divisor = sizeof(julong) * 3;
3843   guarantee(longwords <= (8192 / divisor), "must be");
3844   int total_allocation = longwords * sizeof (julong) * 3;
3845   julong *scratch = (julong *)alloca(total_allocation);
3846 
3847   // Local scratch arrays
3848   julong
3849     *a = scratch + 0 * longwords,
3850     *n = scratch + 1 * longwords,
3851     *m = scratch + 2 * longwords;
3852 
3853   reverse_words((julong *)a_ints, a, longwords);
3854   reverse_words((julong *)n_ints, n, longwords);
3855 
3856   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3857     ::montgomery_square(a, n, m, (julong)inv, longwords);
3858   } else {
3859     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3860   }
3861 
3862   reverse_words(m, (julong *)m_ints, longwords);
3863 }
3864 
3865 #ifdef COMPILER2
3866 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3867 //
3868 //------------------------------generate_exception_blob---------------------------
3869 // Creates the exception blob.
3870 // Compiled code jumps here from its exception handler
3871 // (see emit_exception_handler in the x86_64.ad file).
3872 //
3873 // Given an exception pc at a call we call into the runtime for the
3874 // handler in this method. This handler might merely restore state
3875 // (i.e. callee save registers) unwind the frame and jump to the
3876 // exception handler for the nmethod if there is no Java level handler
3877 // for the nmethod.
3878 //
3879 // This code is entered with a jmp.
3880 //
3881 // Arguments:
3882 //   rax: exception oop
3883 //   rdx: exception pc
3884 //
3885 // Results:
3886 //   rax: exception oop
3887 //   rdx: exception pc in caller or ???
3888 //   destination: exception handler of caller
3889 //
3890 // Note: the exception pc MUST be at a call (precise debug information)
3891 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3892 //
3893 
3894 void OptoRuntime::generate_exception_blob() {
3895   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3896   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3897   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3898 
3899   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3900 
3901   // Allocate space for the code
3902   ResourceMark rm;
3903   // Setup code generation tools
3904   CodeBuffer buffer("exception_blob", 2048, 1024);
3905   MacroAssembler* masm = new MacroAssembler(&buffer);
3906 
3907 
3908   address start = __ pc();
3909 
3910   // Exception pc is 'return address' for stack walker
3911   __ push(rdx);
3912   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3913 
3914   // Save callee-saved registers.  See x86_64.ad.
3915 
3916   // rbp is an implicitly saved callee saved register (i.e., the calling
3917   // convention will save/restore it in the prolog/epilog). Other than that
3918   // there are no callee save registers now that adapter frames are gone.
3919 
3920   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3921 
3922   // Store exception in Thread object. We cannot pass any arguments to the
3923   // handle_exception call, since we do not want to make any assumption
3924   // about the size of the frame in which the exception happened.
3925   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3926   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3927   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3928 
3929   // This call does all the hard work.  It checks if an exception handler
3930   // exists in the method.
3931   // If so, it returns the handler address.
3932   // If not, it prepares for stack-unwinding, restoring the callee-save
3933   // registers of the frame being removed.
3934   //
3935   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3936 
3937   // At a method handle call, the stack may not be properly aligned
3938   // when returning with an exception.
3939   address the_pc = __ pc();
3940   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3941   __ mov(c_rarg0, r15_thread);
3942   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3943   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3944 
3945   // Set an oopmap for the call site.  This oopmap will only be used if we
3946   // are unwinding the stack.  Hence, all locations will be dead.
3947   // Callee-saved registers will be the same as the frame above (i.e.,
3948   // handle_exception_stub), since they were restored when we got the
3949   // exception.
3950 
3951   OopMapSet* oop_maps = new OopMapSet();
3952 
3953   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3954 
3955   __ reset_last_Java_frame(false);
3956 
3957   // Restore callee-saved registers
3958 
3959   // rbp is an implicitly saved callee-saved register (i.e., the calling
3960   // convention will save/restore it in the prolog/epilog). Other than that
3961   // there are no callee-saved registers now that adapter frames are gone.
3962 
3963   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3964 
3965   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3966   __ pop(rdx);                  // No need for exception pc anymore
3967 
3968   // rax: exception handler
3969 
3970   // We have a handler in rax (could be deopt blob).
3971   __ mov(r8, rax);
3972 
3973   // Get the exception oop
3974   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3975   // Get the exception pc in case we are deoptimized
3976   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3977 #ifdef ASSERT
3978   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3979   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3980 #endif
3981   // Clear the exception oop so GC no longer processes it as a root.
3982   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3983 
3984   // rax: exception oop
3985   // r8:  exception handler
3986   // rdx: exception pc
3987   // Jump to handler
3988 
3989   __ jmp(r8);
3990 
3991   // Make sure all code is generated
3992   masm->flush();
3993 
3994   // Set exception blob
3995   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3996 }
3997 #endif // COMPILER2
3998 
3999 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
4000   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
4001   CodeBuffer buffer(buf);
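       // Back the code buffer's relocation info with a small stack-allocated array.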
4002   short buffer_locs[20];
4003   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
4004                                          sizeof(buffer_locs)/sizeof(relocInfo));
4005 
4006   MacroAssembler* masm = new MacroAssembler(&buffer);
4007 
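       // The extended signature describes the inline type's field layout
       // (including T_METADATA and T_VOID marker entries), and return_regs()
       // gives the register or stack slot assigned to each field value.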
4008   const Array<SigEntry>* sig_vk = vk->extended_sig();
4009   const Array<VMRegPair>* regs = vk->return_regs();
4010 
4011   int pack_fields_jobject_off = __ offset();
4012   // Resolve pre-allocated buffer from JNI handle.
4013   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
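       // On entry r13 points to a slot holding a jobject handle to that buffer;
       // resolve the handle to a raw oop and write the oop back into the slot.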
4014   __ movptr(rax, Address(r13, 0));
4015   __ resolve_jobject(rax /* value */,
4016                      r15_thread /* thread */,
4017                      r12 /* tmp */);
4018   __ movptr(Address(r13, 0), rax);
4019 
4020   int pack_fields_off = __ offset();
4021 
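       // Pack: store each field value from its register into the buffer (whose
       // oop is in rax) at the field's offset. j indexes regs, starting at 1
       // because regs->at(0) does not carry a field value. T_METADATA entries
       // are skipped; a T_VOID following a T_LONG or T_DOUBLE is the unused
       // second half of that value, so j is stepped past its entry in regs.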
4022   int j = 1;
4023   for (int i = 0; i < sig_vk->length(); i++) {
4024     BasicType bt = sig_vk->at(i)._bt;
4025     if (bt == T_METADATA) {
4026       continue;
4027     }
4028     if (bt == T_VOID) {
4029       if (sig_vk->at(i-1)._bt == T_LONG ||
4030           sig_vk->at(i-1)._bt == T_DOUBLE) {
4031         j++;
4032       }
4033       continue;
4034     }
4035     int off = sig_vk->at(i)._offset;
4036     assert(off > 0, "offset in object should be positive");
4037     VMRegPair pair = regs->at(j);
4038     VMReg r_1 = pair.first();
4039     VMReg r_2 = pair.second();
4040     Address to(rax, off);
4041     if (bt == T_FLOAT) {
4042       __ movflt(to, r_1->as_XMMRegister());
4043     } else if (bt == T_DOUBLE) {
4044       __ movdbl(to, r_1->as_XMMRegister());
4045     } else {
4046       Register val = r_1->as_Register();
4047       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
4048       if (is_reference_type(bt)) {
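             // Oop field: store with GC barriers. The destination is known to
             // be uninitialized, so IS_DEST_UNINITIALIZED allows the barrier
             // set to elide any pre-write barrier.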
4049         __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
4050       } else {
4051         __ store_sized_value(to, val, type2aelembytes(bt));
4052       }
4053     }
4054     j++;
4055   }
4056   assert(j == regs->length(), "missed a field?");
4057 
4058   __ ret(0);
4059 
4060   int unpack_fields_off = __ offset();
4061 
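       // Unpack: the mirror of the pack loop above. Load each field from the
       // buffered instance whose oop is in rax back into its register. A null
       // oop means there is nothing to unpack, so skip the loads entirely.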
4062   Label skip;
4063   __ testptr(rax, rax);
4064   __ jcc(Assembler::zero, skip);
4065 
4066   j = 1;
4067   for (int i = 0; i < sig_vk->length(); i++) {
4068     BasicType bt = sig_vk->at(i)._bt;
4069     if (bt == T_METADATA) {
4070       continue;
4071     }
4072     if (bt == T_VOID) {
4073       if (sig_vk->at(i-1)._bt == T_LONG ||
4074           sig_vk->at(i-1)._bt == T_DOUBLE) {
4075         j++;
4076       }
4077       continue;
4078     }
4079     int off = sig_vk->at(i)._offset;
4080     assert(off > 0, "offset in object should be positive");
4081     VMRegPair pair = regs->at(j);
4082     VMReg r_1 = pair.first();
4083     VMReg r_2 = pair.second();
4084     Address from(rax, off);
4085     if (bt == T_FLOAT) {
4086       __ movflt(r_1->as_XMMRegister(), from);
4087     } else if (bt == T_DOUBLE) {
4088       __ movdbl(r_1->as_XMMRegister(), from);
4089     } else if (bt == T_OBJECT || bt == T_ARRAY) {
4090       assert_different_registers(rax, r_1->as_Register());
4091       __ load_heap_oop(r_1->as_Register(), from);
4092     } else {
4093       assert(is_java_primitive(bt), "unexpected basic type");
4094       assert_different_registers(rax, r_1->as_Register());
4095       size_t size_in_bytes = type2aelembytes(bt);
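           // Sub-word values are sign-extended, except T_CHAR and T_BOOLEAN,
           // which are zero-extended.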
4096       __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
4097     }
4098     j++;
4099   }
4100   assert(j == regs->length(), "missed a field?");
4101 
4102   __ bind(skip);
4103   __ ret(0);
4104 
4105   __ flush();
4106 
4107   return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
4108 }